Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-10-06 17:32:39 +02:00)

Compare commits: deploy-030 ... deploy-031 (45 commits)
Commits in this range (the author and date columns are empty in the mirrored view; only abbreviated SHA1 hashes survive):

8503030f18, 744f7d3ef7, 215e12afe9, 2716bce918, caf2e6fbb7, 233f0acfb1, e3a4ff02e9, c786283ae1, a3f65ac0e0, aba1a32af0, c9c442345b, 2e126ba30e, 2087985f49, 2b13ebd18b, 6d92c125fe, f638cfa39a, 89447c12af, c71fc46f04, f96874d828, 583a84d5a0, f65b946448, 3682815855, 3a94357660, 673b0d3de1, ea942bc664, 7ed5083c54, 08bb2c097b, 495fb325be, 05c25bbaec, 2a028b84f3, a091a23623, e8897acb45, b89ffcf2be, dbcc9055b0, d9740557f4, 0d6cd015fd, c6034efcc8, 76068014ad, 1c3ed67127, fc0cb6bd9a, c2601bac78, f5641b72e9, 36efe2e219, 983fe3829e, 668c87aa86
```diff
@@ -105,8 +105,6 @@ public enum HtmlFeature {
     }
 
     public int getFeatureBit() {
-        if (getClass().desiredAssertionStatus() && ordinal() >= 32)
-            throw new IllegalStateException("Attempting to extract feature bit of " + name() + ", with ordinal " + ordinal());
         return (1<< ordinal());
     }
 }
```
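The guard deleted above existed because Java masks an int shift count to its low five bits, so an enum constant with ordinal 32 would silently produce the same bit as ordinal 0. A minimal standalone illustration of the wraparound (plain JLS shift semantics, not Marginalia code):

```java
public class ShiftWrapDemo {
    public static void main(String[] args) {
        // For int shifts, Java uses only the low 5 bits of the count (JLS 15.19),
        // so 1 << 32 == 1 << 0: a 33rd feature would alias feature 0 in the bitmask.
        System.out.println(1 << 0);   // 1
        System.out.println(1 << 31);  // -2147483648 (highest bit)
        System.out.println(1 << 32);  // 1 again -- the case the removed assertion caught
    }
}
```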
```diff
@@ -304,7 +304,6 @@ public class QueryProtobufCodec {
                 IndexProtobufCodec.convertRpcQuery(specs.getQuery()),
                 specs.getDomainsList(),
                 specs.getSearchSetIdentifier(),
-                specs.getHumanQuery(),
                 IndexProtobufCodec.convertSpecLimit(specs.getQuality()),
                 IndexProtobufCodec.convertSpecLimit(specs.getYear()),
                 IndexProtobufCodec.convertSpecLimit(specs.getSize()),
```
```diff
@@ -18,8 +18,6 @@ public class SearchSpecification {
 
     public String searchSetIdentifier;
 
-    public final String humanQuery;
-
     public SpecificationLimit quality;
     public SpecificationLimit year;
     public SpecificationLimit size;
```
```diff
@@ -35,7 +33,6 @@ public class SearchSpecification {
     public SearchSpecification(SearchQuery query,
                                List<Integer> domains,
                                String searchSetIdentifier,
-                               String humanQuery,
                                SpecificationLimit quality,
                                SpecificationLimit year,
                                SpecificationLimit size,
```
```diff
@@ -47,7 +44,6 @@ public class SearchSpecification {
         this.query = query;
         this.domains = domains;
         this.searchSetIdentifier = searchSetIdentifier;
-        this.humanQuery = humanQuery;
         this.quality = quality;
         this.year = year;
         this.size = size;
```
```diff
@@ -73,10 +69,6 @@ public class SearchSpecification {
         return this.searchSetIdentifier;
     }
 
-    public String getHumanQuery() {
-        return this.humanQuery;
-    }
-
     public SpecificationLimit getQuality() {
         return this.quality;
     }
```
```diff
@@ -106,14 +98,13 @@ public class SearchSpecification {
     }
 
     public String toString() {
-        return "SearchSpecification(query=" + this.getQuery() + ", domains=" + this.getDomains() + ", searchSetIdentifier=" + this.getSearchSetIdentifier() + ", humanQuery=" + this.getHumanQuery() + ", quality=" + this.getQuality() + ", year=" + this.getYear() + ", size=" + this.getSize() + ", rank=" + this.getRank() + ", queryLimits=" + this.getQueryLimits() + ", queryStrategy=" + this.getQueryStrategy() + ", rankingParams=" + this.getRankingParams() + ")";
+        return "SearchSpecification(query=" + this.getQuery() + ", domains=" + this.getDomains() + ", searchSetIdentifier=" + this.getSearchSetIdentifier() + ", quality=" + this.getQuality() + ", year=" + this.getYear() + ", size=" + this.getSize() + ", rank=" + this.getRank() + ", queryLimits=" + this.getQueryLimits() + ", queryStrategy=" + this.getQueryStrategy() + ", rankingParams=" + this.getRankingParams() + ")";
     }
 
     public static class SearchSpecificationBuilder {
         private SearchQuery query;
         private List<Integer> domains;
         private String searchSetIdentifier;
-        private String humanQuery;
         private SpecificationLimit quality$value;
         private boolean quality$set;
         private SpecificationLimit year$value;
```
```diff
@@ -144,11 +135,6 @@ public class SearchSpecification {
             return this;
         }
 
-        public SearchSpecificationBuilder humanQuery(String humanQuery) {
-            this.humanQuery = humanQuery;
-            return this;
-        }
-
         public SearchSpecificationBuilder quality(SpecificationLimit quality) {
             this.quality$value = quality;
             this.quality$set = true;
```
```diff
@@ -205,11 +191,7 @@ public class SearchSpecification {
             if (!this.rank$set) {
                 rank$value = SpecificationLimit.none();
             }
-            return new SearchSpecification(this.query, this.domains, this.searchSetIdentifier, this.humanQuery, quality$value, year$value, size$value, rank$value, this.queryLimits, this.queryStrategy, this.rankingParams);
-        }
-
-        public String toString() {
-            return "SearchSpecification.SearchSpecificationBuilder(query=" + this.query + ", domains=" + this.domains + ", searchSetIdentifier=" + this.searchSetIdentifier + ", humanQuery=" + this.humanQuery + ", quality$value=" + this.quality$value + ", year$value=" + this.year$value + ", size$value=" + this.size$value + ", rank$value=" + this.rank$value + ", queryLimits=" + this.queryLimits + ", queryStrategy=" + this.queryStrategy + ", rankingParams=" + this.rankingParams + ")";
+            return new SearchSpecification(this.query, this.domains, this.searchSetIdentifier, quality$value, year$value, size$value, rank$value, this.queryLimits, this.queryStrategy, this.rankingParams);
         }
     }
 }
```
Deleted file: nu.marginalia.api.searchquery.model.results.ResultRankingContext

```diff
@@ -1,56 +0,0 @@
-package nu.marginalia.api.searchquery.model.results;
-
-import nu.marginalia.api.searchquery.RpcResultRankingParameters;
-import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
-
-import java.util.BitSet;
-
-public class ResultRankingContext {
-    private final int docCount;
-    public final RpcResultRankingParameters params;
-
-    public final BitSet regularMask;
-    public final BitSet ngramsMask;
-
-    /** CqDataInt associated with frequency information of the terms in the query
-     * in the full index. The dataset is indexed by the compiled query. */
-    public final CqDataInt fullCounts;
-
-    /** CqDataInt associated with frequency information of the terms in the query
-     * in the full index. The dataset is indexed by the compiled query. */
-    public final CqDataInt priorityCounts;
-
-    public ResultRankingContext(int docCount,
-                                RpcResultRankingParameters params,
-                                BitSet ngramsMask,
-                                BitSet regularMask,
-                                CqDataInt fullCounts,
-                                CqDataInt prioCounts)
-    {
-        this.docCount = docCount;
-        this.params = params;
-
-        this.ngramsMask = ngramsMask;
-        this.regularMask = regularMask;
-
-        this.fullCounts = fullCounts;
-        this.priorityCounts = prioCounts;
-    }
-
-    public int termFreqDocCount() {
-        return docCount;
-    }
-
-    @Override
-    public String toString() {
-        return "ResultRankingContext{" +
-                "docCount=" + docCount +
-                ", params=" + params +
-                ", regularMask=" + regularMask +
-                ", ngramsMask=" + ngramsMask +
-                ", fullCounts=" + fullCounts +
-                ", priorityCounts=" + priorityCounts +
-                '}';
-    }
-}
```
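The class deleted here is not gone from the codebase: the new PerfTestMain further down imports nu.marginalia.index.model.ResultRankingContext and calls a ResultRankingContext.create(...) factory, which suggests the class was moved into the index module and reworked rather than removed outright.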
```diff
@@ -34,8 +34,6 @@ public class QueryFactory {
         this.queryExpansion = queryExpansion;
     }
 
-
-
     public ProcessedQuery createQuery(QueryParams params,
                                       @Nullable RpcResultRankingParameters rankingParams) {
         final var query = params.humanQuery();
```
```diff
@@ -153,7 +151,6 @@ public class QueryFactory {
 
         var specsBuilder = SearchSpecification.builder()
                 .query(queryBuilder.build())
-                .humanQuery(query)
                 .quality(qualityLimit)
                 .year(year)
                 .size(size)
```
```diff
@@ -241,7 +241,6 @@ public class QueryFactoryTest {
 
         Assertions.assertTrue(subquery.query.compiledQuery.contains(" bob "));
         Assertions.assertFalse(subquery.query.compiledQuery.contains(" bob's "));
-        Assertions.assertEquals("\"bob's cars\"", subquery.humanQuery);
     }
 
     @Test
```
```diff
@@ -1,9 +1,10 @@
 package nu.marginalia.index.forward;
 
+import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;
 import nu.marginalia.index.forward.spans.DocumentSpans;
-import nu.marginalia.index.forward.spans.ForwardIndexSpansReader;
+import nu.marginalia.index.forward.spans.IndexSpansReader;
 import nu.marginalia.model.id.UrlIdCodec;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
```
```diff
@@ -22,16 +23,15 @@ import static nu.marginalia.index.forward.ForwardIndexParameters.*;
  * and a mapping between document identifiers to the index into the
  * data array.
  * <p/>
- * Since the total data is relatively small, this is kept in memory to
- * reduce the amount of disk thrashing.
- * <p/>
  * The metadata is a binary encoding of {@see nu.marginalia.idx.DocumentMetadata}
  */
 public class ForwardIndexReader {
     private final LongArray ids;
     private final LongArray data;
 
-    private final ForwardIndexSpansReader spansReader;
+    private volatile Long2IntOpenHashMap idsMap;
+
+    private final IndexSpansReader spansReader;
 
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
```
```diff
@@ -64,7 +64,18 @@ public class ForwardIndexReader {
 
         ids = loadIds(idsFile);
         data = loadData(dataFile);
-        spansReader = new ForwardIndexSpansReader(spansFile);
+
+        spansReader = IndexSpansReader.open(spansFile);
+
+        Thread.ofPlatform().start(this::createIdsMap);
+    }
+
+    private void createIdsMap() {
+        Long2IntOpenHashMap idsMap = new Long2IntOpenHashMap((int) ids.size());
+        for (int i = 0; i < ids.size(); i++) {
+            idsMap.put(ids.get(i), i);
+        }
+        this.idsMap = idsMap;
     }
 
     private static LongArray loadIds(Path idsFile) throws IOException {
```
```diff
@@ -106,6 +117,10 @@ public class ForwardIndexReader {
     private int idxForDoc(long docId) {
         assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
 
+        if (idsMap != null) {
+            return idsMap.getOrDefault(docId, -1);
+        }
+
         long offset = ids.binarySearch(docId, 0, ids.size());
 
         if (offset >= ids.size() || offset < 0 || ids.get(offset) != docId) {
```
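Taken together, the two ForwardIndexReader hunks above implement a common pattern: lookups are served by binary search over the sorted id array until a background thread finishes building a hash map, which is then published through a volatile field and used from that point on. A minimal sketch of the same pattern outside Marginalia's classes, assuming Java 21 for Thread.ofPlatform():

```java
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

// Serve lookups by O(log n) binary search until the O(1) map is ready.
// The volatile write/read gives safe publication: readers either see null
// (and fall back) or a fully constructed map, never a partial one.
class LookupTable {
    private final long[] sortedIds;
    private volatile Map<Long, Integer> idsMap; // null until the build completes

    LookupTable(long[] sortedIds) {
        this.sortedIds = sortedIds;
        Thread.ofPlatform().start(this::buildMap);
    }

    private void buildMap() {
        Map<Long, Integer> map = new HashMap<>(sortedIds.length);
        for (int i = 0; i < sortedIds.length; i++) {
            map.put(sortedIds[i], i);
        }
        idsMap = map; // publish only when complete
    }

    int indexOf(long id) {
        var map = idsMap; // single volatile read
        if (map != null) {
            return map.getOrDefault(id, -1);
        }
        int idx = Arrays.binarySearch(sortedIds, id); // fallback while building
        return idx >= 0 ? idx : -1;
    }
}
```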
```diff
@@ -134,6 +149,27 @@ public class ForwardIndexReader {
     }
 
 
+    public DocumentSpans[] getDocumentSpans(Arena arena, long[] docIds) {
+        long[] offsets = new long[docIds.length];
+        for (int i = 0; i < docIds.length; i++) {
+            long offset = idxForDoc(docIds[i]);
+            if (offset >= 0) {
+                offsets[i] = data.get(ENTRY_SIZE * offset + SPANS_OFFSET);
+            }
+            else {
+                offsets[i] = -1;
+            }
+        }
+
+        try {
+            return spansReader.readSpans(arena, offsets);
+        }
+        catch (IOException ex) {
+            logger.error("Failed to read spans for docIds", ex);
+            return new DocumentSpans[docIds.length];
+        }
+    }
+
     public int totalDocCount() {
         return (int) ids.size();
     }
```
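A hypothetical call site for the new batch API, mirroring how the test code further down manages lifetimes: one confined Arena scopes the off-heap buffers behind every DocumentSpans in the batch, so the results must be consumed before the try block exits. Per the catch clause above, a failed read surfaces as null slots rather than aborting the whole batch.

```java
import java.lang.foreign.Arena;

import nu.marginalia.index.forward.ForwardIndexReader;

class SpanBatchExample {
    static void printSpanAvailability(ForwardIndexReader reader, long[] docIds) {
        try (Arena arena = Arena.ofConfined()) {
            var spans = reader.getDocumentSpans(arena, docIds);
            for (int i = 0; i < spans.length; i++) {
                // Null slot: the doc had no spans entry or the read failed
                System.out.println(docIds[i] + ": " + (spans[i] != null ? "spans read" : "no spans"));
            }
        } // arena closes here; the memory backing the spans is released
    }
}
```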
```diff
@@ -141,6 +177,8 @@ public class ForwardIndexReader {
     public void close() {
         if (data != null)
             data.close();
+        if (ids != null)
+            ids.close();
     }
 
     public boolean isLoaded() {
```
```diff
@@ -5,7 +5,7 @@ import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;
 import nu.marginalia.index.domainrankings.DomainRankings;
 import nu.marginalia.index.forward.ForwardIndexParameters;
-import nu.marginalia.index.forward.spans.ForwardIndexSpansWriter;
+import nu.marginalia.index.forward.spans.IndexSpansWriter;
 import nu.marginalia.index.journal.IndexJournal;
 import nu.marginalia.model.id.UrlIdCodec;
 import nu.marginalia.model.idx.DocumentMetadata;
```
```diff
@@ -65,7 +65,7 @@ public class ForwardIndexConverter {
         logger.info("Domain Rankings size = {}", domainRankings.size());
 
         try (var progress = heartbeat.createProcessTaskHeartbeat(TaskSteps.class, "forwardIndexConverter");
-             var spansWriter = new ForwardIndexSpansWriter(outputFileSpansData)
+             var spansWriter = new IndexSpansWriter(outputFileSpansData)
         ) {
             progress.progress(TaskSteps.GET_DOC_IDS);
 
```
```diff
@@ -11,6 +9,9 @@ public class DocumentSpan {
     /** A list of the interlaced start and end positions of each span in the document of this type */
     private final IntList startsEnds;
 
+    public DocumentSpan(IntList startsEnds) {
+        this.startsEnds = startsEnds;
+    }
     public DocumentSpan(CodedSequence startsEnds) {
         this.startsEnds = startsEnds.values();
     }
```
```diff
@@ -1,5 +1,6 @@
 package nu.marginalia.index.forward.spans;
 
+import it.unimi.dsi.fastutil.ints.IntList;
 import nu.marginalia.language.sentence.tag.HtmlTag;
 import nu.marginalia.sequence.CodedSequence;
 
```
```diff
@@ -39,6 +40,23 @@ public class DocumentSpans {
         return EMPTY_SPAN;
     }
 
+    void accept(byte code, IntList positions) {
+        if (code == HtmlTag.HEADING.code)
+            this.heading = new DocumentSpan(positions);
+        else if (code == HtmlTag.TITLE.code)
+            this.title = new DocumentSpan(positions);
+        else if (code == HtmlTag.NAV.code)
+            this.nav = new DocumentSpan(positions);
+        else if (code == HtmlTag.CODE.code)
+            this.code = new DocumentSpan(positions);
+        else if (code == HtmlTag.ANCHOR.code)
+            this.anchor = new DocumentSpan(positions);
+        else if (code == HtmlTag.EXTERNAL_LINKTEXT.code)
+            this.externalLinkText = new DocumentSpan(positions);
+        else if (code == HtmlTag.BODY.code)
+            this.body = new DocumentSpan(positions);
+    }
+
     void accept(byte code, CodedSequence positions) {
         if (code == HtmlTag.HEADING.code)
             this.heading = new DocumentSpan(positions);
```
New file: nu.marginalia.index.forward.spans.IndexSpansReader

```diff
@@ -0,0 +1,25 @@
+package nu.marginalia.index.forward.spans;
+
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.nio.file.Path;
+
+public interface IndexSpansReader extends AutoCloseable {
+    DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException;
+    DocumentSpans[] readSpans(Arena arena, long[] encodedOffsets) throws IOException;
+
+    static IndexSpansReader open(Path fileName) throws IOException {
+        int version = SpansCodec.parseSpanFilesFooter(fileName);
+        if (version == SpansCodec.SpansCodecVersion.COMPRESSED.ordinal()) {
+            return new IndexSpansReaderCompressed(fileName);
+        }
+        else if (version == SpansCodec.SpansCodecVersion.PLAIN.ordinal()) {
+            return new IndexSpansReaderPlain(fileName);
+        }
+        else {
+            throw new IllegalArgumentException("Unsupported spans file version: " + version);
+        }
+    }
+
+    void close() throws IOException;
+}
```
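The static open() is the only place call sites need to care about the on-disk format: it inspects the file footer and hands back either the deprecated compressed reader or the new plain reader. A sketch of it in use, with the file path and offset as placeholders:

```java
import java.lang.foreign.Arena;
import java.nio.file.Path;

import nu.marginalia.index.forward.spans.IndexSpansReader;

class SpansReaderUsage {
    // Version dispatch is hidden behind open(); the caller just sees the interface
    static void dumpRecord(Path spansFile, long encodedOffset) throws Exception {
        try (var reader = IndexSpansReader.open(spansFile);
             var arena = Arena.ofConfined()) {
            var spans = reader.readSpans(arena, encodedOffset);
            System.out.println(spans);
        }
    }
}
```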
```diff
@@ -10,11 +10,11 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
 
-@SuppressWarnings("preview")
-public class ForwardIndexSpansReader implements AutoCloseable {
+@Deprecated
+public class IndexSpansReaderCompressed implements AutoCloseable, IndexSpansReader {
     private final FileChannel spansFileChannel;
 
-    public ForwardIndexSpansReader(Path spansFile) throws IOException {
+    public IndexSpansReaderCompressed(Path spansFile) throws IOException {
         this.spansFileChannel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ);
     }
 
```
```diff
@@ -51,6 +51,17 @@ public class ForwardIndexSpansReader implements AutoCloseable {
         return ret;
     }
 
+    @Override
+    public DocumentSpans[] readSpans(Arena arena, long[] encodedOffsets) throws IOException {
+        DocumentSpans[] ret = new DocumentSpans[encodedOffsets.length];
+        for (int i = 0; i < encodedOffsets.length; i++) {
+            if (encodedOffsets[i] >= 0) {
+                ret[i] = readSpans(arena, encodedOffsets[i]);
+            }
+        }
+        return ret;
+    }
+
     @Override
     public void close() throws IOException {
         spansFileChannel.close();
```
New file: nu.marginalia.index.forward.spans.IndexSpansReaderPlain

```diff
@@ -0,0 +1,122 @@
+package nu.marginalia.index.forward.spans;
+
+import it.unimi.dsi.fastutil.ints.IntArrayList;
+
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.lang.foreign.ValueLayout;
+import java.nio.channels.FileChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ForkJoinPool;
+
+public class IndexSpansReaderPlain implements IndexSpansReader {
+    private final FileChannel[] spansFileChannels;
+    private final ForkJoinPool forkJoinPool;
+
+    public IndexSpansReaderPlain(Path spansFile) throws IOException {
+        this.spansFileChannels = new FileChannel[8];
+        for (int i = 0; i < spansFileChannels.length; i++) {
+            spansFileChannels[i] = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ);
+        }
+        forkJoinPool = new ForkJoinPool(spansFileChannels.length);
+    }
+
+    @Override
+    public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException {
+        // Decode the size and offset from the encoded offset
+        long size = SpansCodec.decodeSize(encodedOffset);
+        long offset = SpansCodec.decodeStartOffset(encodedOffset);
+
+        var ms = arena.allocate(size, 4);
+        // Allocate a buffer from the arena
+        var buffer = ms.asByteBuffer();
+        while (buffer.hasRemaining()) {
+            spansFileChannels[0].read(buffer, offset + buffer.position());
+        }
+
+        return decode(ms);
+    }
+
+    public DocumentSpans decode(MemorySegment ms) {
+        int count = ms.get(ValueLayout.JAVA_INT, 0);
+        int pos = 4;
+        DocumentSpans ret = new DocumentSpans();
+
+        // Decode each span
+        for (int spanIdx = 0; spanIdx < count; spanIdx++) {
+            byte code = ms.get(ValueLayout.JAVA_BYTE, pos);
+            short len = ms.get(ValueLayout.JAVA_SHORT, pos+2);
+
+            IntArrayList values = new IntArrayList(len);
+
+            pos += 4;
+            for (int i = 0; i < len; i++) {
+                values.add(ms.get(ValueLayout.JAVA_INT, pos + 4*i));
+            }
+            ret.accept(code, values);
+            pos += 4*len;
+        }
+
+        return ret;
+    }
+
+    @Override
+    public DocumentSpans[] readSpans(Arena arena, long[] encodedOffsets) throws IOException {
+        long totalSize = 0;
+        int numJobs = 0;
+        for (long offset : encodedOffsets) {
+            if (offset < 0)
+                continue;
+            totalSize += SpansCodec.decodeSize(offset);
+            numJobs++;
+        }
+
+        DocumentSpans[] ret = new DocumentSpans[encodedOffsets.length];
+        if (numJobs == 0) return ret;
+
+        CountDownLatch latch = new CountDownLatch(numJobs);
+        MemorySegment segment = arena.allocate(totalSize, 8);
+
+        long bufferOffset = 0;
+        for (int idx = 0; idx < encodedOffsets.length; idx++) {
+            long size = SpansCodec.decodeSize(encodedOffsets[idx]);
+            long start = SpansCodec.decodeStartOffset(encodedOffsets[idx]);
+
+            MemorySegment slice = segment.asSlice(bufferOffset, size);
+            bufferOffset += size;
+
+            int i = idx;
+            forkJoinPool.execute(() -> {
+                var buffer = slice.asByteBuffer();
+                try {
+                    spansFileChannels[i % spansFileChannels.length].read(buffer, start);
+                    ret[i] = decode(slice);
+                }
+                catch (IOException ex) {
+                    throw new RuntimeException(ex);
+                }
+                finally {
+                    latch.countDown();
+                }
+            });
+        }
+        try {
+            latch.await();
+        }
+        catch (InterruptedException ex) {
+            Thread.currentThread().interrupt();
+        }
+        return ret;
+    }
+
+    @Override
+    public void close() throws IOException {
+        for (var spansFileChannel : spansFileChannels) {
+            spansFileChannel.close();
+        }
+    }
+}
```
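Reading the plain reader's decode() together with the writer changes further down, the per-record layout they imply is summarized below. This is an inference from the code in this diff, not documented upstream:

```java
// Per-record layout implied by IndexSpansReaderPlain.decode() and the new
// IndexSpansWriter (native byte order, 4-byte-aligned headers):
//
//   int32   count            // number of spans in the record
//   then, repeated count times:
//     int8  code             // HtmlTag code identifying the span type
//     int8  pad              // alignment byte (the writer emits (byte) 0)
//     int16 len              // number of int32 position values that follow
//     int32 values[len]      // interlaced start/end positions
```

The batch readSpans striping the reads over eight channels via a ForkJoinPool is the same trick the PositionsFileReader comment at the end of this diff explains: multiple FileChannel instances on one file keep concurrent positional reads from serializing on a single channel.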
```diff
@@ -1,20 +1,23 @@
 package nu.marginalia.index.forward.spans;
 
+import nu.marginalia.sequence.VarintCodedSequence;
+
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.nio.channels.FileChannel;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
 
-public class ForwardIndexSpansWriter implements AutoCloseable {
+public class IndexSpansWriter implements AutoCloseable {
     private final FileChannel outputChannel;
-    private final ByteBuffer work = ByteBuffer.allocate(32);
+    private final ByteBuffer work = ByteBuffer.allocate(65536).order(ByteOrder.nativeOrder());
 
     private long stateStartOffset = -1;
     private int stateLength = -1;
 
-    public ForwardIndexSpansWriter(Path outputFileSpansData) throws IOException {
+    public IndexSpansWriter(Path outputFileSpansData) throws IOException {
         this.outputChannel = (FileChannel) Files.newByteChannel(outputFileSpansData, StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE);
     }
 
```
```diff
@@ -23,7 +26,7 @@ public class ForwardIndexSpansWriter implements AutoCloseable {
         stateLength = 0;
 
         work.clear();
-        work.put((byte) count);
+        work.putInt(count);
         work.flip();
 
         while (work.hasRemaining())
```
```diff
@@ -33,12 +36,17 @@ public class ForwardIndexSpansWriter implements AutoCloseable {
     public void writeSpan(byte spanCode, ByteBuffer sequenceData) throws IOException {
         work.clear();
         work.put(spanCode);
-        work.putShort((short) sequenceData.remaining());
+        work.put((byte) 0); // Ensure we're byte aligned
+        var sequence = new VarintCodedSequence(sequenceData);
+        work.putShort((short) sequence.valueCount());
+
+        var iter = sequence.iterator();
+        while (iter.hasNext()) {
+            work.putInt(iter.nextInt());
+        }
         work.flip();
 
-        while (work.hasRemaining() || sequenceData.hasRemaining()) {
-            stateLength += (int) outputChannel.write(new ByteBuffer[]{work, sequenceData});
-        }
+        stateLength += outputChannel.write(work);
     }
 
     public long endRecord() {
```
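The writer no longer streams the caller's varint-coded buffer to disk verbatim; it decodes the sequence once at write time and stores fixed-width int32s, trading file size for readers that can index positions directly instead of decoding varints on every lookup. A small demonstration of the sequence API involved, using only method names that appear in this diff:

```java
import nu.marginalia.sequence.VarintCodedSequence;

class SpanEncodingDemo {
    static void demo() {
        // generate(...) and buffer() are how the tests below construct span data;
        // valueCount() and the int iterator are what the new writeSpan consumes.
        var sequence = VarintCodedSequence.generate(1, 3, 5, 8);
        var iter = sequence.iterator();
        System.out.print(sequence.valueCount() + " values:");
        while (iter.hasNext()) {
            System.out.print(" " + iter.nextInt());
        }
        System.out.println();
    }
}
```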
```diff
@@ -47,6 +55,11 @@ public class ForwardIndexSpansWriter implements AutoCloseable {
 
     @Override
     public void close() throws IOException {
+        ByteBuffer footer = SpansCodec.createSpanFilesFooter(SpansCodec.SpansCodecVersion.PLAIN);
+        outputChannel.position(outputChannel.size());
+        while (footer.hasRemaining()) {
+            outputChannel.write(footer, outputChannel.size());
+        }
         outputChannel.close();
     }
 }
```
```diff
@@ -1,6 +1,21 @@
 package nu.marginalia.index.forward.spans;
 
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+
 public class SpansCodec {
+    public static int MAGIC_INT = 0xF000F000;
+    public static int FOOTER_SIZE = 8;
+
+    enum SpansCodecVersion {
+        @Deprecated
+        COMPRESSED,
+        PLAIN
+    }
+
     public static long encode(long startOffset, long size) {
         assert size < 0x1000_0000L : "Size must be less than 2^28";
 
```
```diff
@@ -14,4 +29,31 @@ public class SpansCodec {
     public static long decodeSize(long encoded) {
         return encoded & 0x0FFF_FFFFL;
     }
+
+    public static ByteBuffer createSpanFilesFooter(SpansCodecVersion version) {
+        ByteBuffer footer = ByteBuffer.allocate(FOOTER_SIZE);
+        footer.putInt(SpansCodec.MAGIC_INT);
+        footer.put((byte) version.ordinal());
+        footer.put((byte) 0);
+        footer.put((byte) 0);
+        footer.put((byte) 0);
+        footer.flip();
+        return footer;
+    }
+
+    public static int parseSpanFilesFooter(Path spansFile) throws IOException {
+        ByteBuffer buffer = ByteBuffer.allocate(FOOTER_SIZE);
+
+        try (var fc = FileChannel.open(spansFile, StandardOpenOption.READ)) {
+            if (fc.size() < FOOTER_SIZE) return 0;
+            fc.read(buffer, fc.size() - buffer.capacity());
+            buffer.flip();
+            int magic = buffer.getInt();
+            if (magic != MAGIC_INT) {
+                return 0;
+            }
+            return buffer.get();
+        }
+
+    }
 }
```
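SpansCodec.encode() packs a start offset and a size into a single long, with the size in the low 28 bits (decodeSize masks with 0x0FFF_FFFF, matching the assertion that size stays below 2^28). decodeStartOffset() is not shown in this excerpt, so the shift below is an assumption consistent with the mask; a hypothetical round trip looks like:

```java
class SpansCodecSketch {
    // Assumed packing: offset in the high bits, size in the low 28 bits.
    // Only decodeSize() and the size assertion are confirmed by the diff above.
    static long encode(long startOffset, long size) {
        assert size < 0x1000_0000L : "Size must be less than 2^28";
        return (startOffset << 28) | size;
    }
    static long decodeStartOffset(long encoded) { return encoded >>> 28; }
    static long decodeSize(long encoded)        { return encoded & 0x0FFF_FFFFL; }

    public static void main(String[] args) {
        long enc = encode(1_000_000L, 4096);
        System.out.println(decodeStartOffset(enc)); // 1000000
        System.out.println(decodeSize(enc));        // 4096
    }
}
```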
```diff
@@ -1,8 +1,9 @@
 package nu.marginalia.index.forward;
 
 import it.unimi.dsi.fastutil.ints.IntList;
-import nu.marginalia.index.forward.spans.ForwardIndexSpansReader;
-import nu.marginalia.index.forward.spans.ForwardIndexSpansWriter;
+import nu.marginalia.index.forward.spans.IndexSpansReader;
+import nu.marginalia.index.forward.spans.IndexSpansReaderPlain;
+import nu.marginalia.index.forward.spans.IndexSpansWriter;
 import nu.marginalia.language.sentence.tag.HtmlTag;
 import nu.marginalia.sequence.VarintCodedSequence;
 import org.junit.jupiter.api.AfterEach;
```
```diff
@@ -17,10 +18,10 @@ import java.nio.file.Path;
 
 import static org.junit.jupiter.api.Assertions.*;
 
-class ForwardIndexSpansReaderTest {
+class IndexSpansReaderTest {
     Path testFile = Files.createTempFile("test", ".idx");
 
-    ForwardIndexSpansReaderTest() throws IOException {
+    IndexSpansReaderTest() throws IOException {
     }
 
     @AfterEach
```
```diff
@@ -34,7 +35,7 @@ class ForwardIndexSpansReaderTest {
 
         long offset1;
         long offset2;
-        try (var writer = new ForwardIndexSpansWriter(testFile)) {
+        try (var writer = new IndexSpansWriter(testFile)) {
             writer.beginRecord(1);
             writer.writeSpan(HtmlTag.HEADING.code, VarintCodedSequence.generate(1, 3, 5, 8).buffer());
             offset1 = writer.endRecord();
```
```diff
@@ -46,7 +47,7 @@ class ForwardIndexSpansReaderTest {
             offset2 = writer.endRecord();
         }
 
-        try (var reader = new ForwardIndexSpansReader(testFile);
+        try (var reader = IndexSpansReader.open(testFile);
              var arena = Arena.ofConfined()
         ) {
             var spans1 = reader.readSpans(arena, offset1);
```
```diff
@@ -77,13 +78,13 @@ class ForwardIndexSpansReaderTest {
     @Test
     void testContainsRange() throws IOException {
         long offset1;
-        try (var writer = new ForwardIndexSpansWriter(testFile)) {
+        try (var writer = new IndexSpansWriter(testFile)) {
             writer.beginRecord(1);
             writer.writeSpan(HtmlTag.HEADING.code, VarintCodedSequence.generate( 1, 2, 10, 15, 20, 25).buffer());
             offset1 = writer.endRecord();
         }
 
-        try (var reader = new ForwardIndexSpansReader(testFile);
+        try (var reader = new IndexSpansReaderPlain(testFile);
              var arena = Arena.ofConfined()
         ) {
             var spans1 = reader.readSpans(arena, offset1);
```
```diff
@@ -104,13 +105,13 @@ class ForwardIndexSpansReaderTest {
     @Test
     void testContainsRangeExact() throws IOException {
         long offset1;
-        try (var writer = new ForwardIndexSpansWriter(testFile)) {
+        try (var writer = new IndexSpansWriter(testFile)) {
             writer.beginRecord(1);
             writer.writeSpan(HtmlTag.HEADING.code, VarintCodedSequence.generate( 1, 2, 10, 15, 20, 25).buffer());
             offset1 = writer.endRecord();
         }
 
-        try (var reader = new ForwardIndexSpansReader(testFile);
+        try (var reader = new IndexSpansReaderPlain(testFile);
             var arena = Arena.ofConfined()
         ) {
             var spans1 = reader.readSpans(arena, offset1);
```
```diff
@@ -131,13 +132,13 @@ class ForwardIndexSpansReaderTest {
     @Test
     void testCountRangeMatches() throws IOException {
         long offset1;
-        try (var writer = new ForwardIndexSpansWriter(testFile)) {
+        try (var writer = new IndexSpansWriter(testFile)) {
             writer.beginRecord(1);
             writer.writeSpan(HtmlTag.HEADING.code, VarintCodedSequence.generate( 1, 2, 10, 15, 20, 25).buffer());
             offset1 = writer.endRecord();
         }
 
-        try (var reader = new ForwardIndexSpansReader(testFile);
+        try (var reader = new IndexSpansReaderPlain(testFile);
             var arena = Arena.ofConfined()
         ) {
             var spans1 = reader.readSpans(arena, offset1);
```
New file: code/index/index-perftest/build.gradle (53 lines)

```diff
@@ -0,0 +1,53 @@
+plugins {
+    id 'java'
+    id 'application'
+    id 'jvm-test-suite'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
+    }
+}
+
+application {
+    mainClass = 'nu.marginalia.index.perftest.PerfTestMain'
+}
+
+apply from: "$rootProject.projectDir/srcsets.gradle"
+
+dependencies {
+    implementation project(':code:common:config')
+    implementation project(':code:common:db')
+
+    implementation project(':code:libraries:array')
+    implementation project(':code:libraries:btree')
+    implementation project(':code:libraries:term-frequency-dict')
+    implementation project(':code:common:linkdb')
+    implementation project(':code:index')
+    implementation project(':code:index:query')
+    implementation project(':code:index:index-forward')
+    implementation project(':code:index:index-reverse')
+    implementation project(':third-party:commons-codec')
+    implementation project(':code:functions:search-query')
+    implementation project(':code:functions:search-query:api')
+
+    implementation libs.slop
+    implementation libs.roaringbitmap
+    implementation libs.bundles.slf4j
+    implementation libs.guava
+
+    libs.bundles.grpc.get().each {
+        implementation dependencies.create(it) {
+            exclude group: 'com.google.guava'
+        }
+    }
+
+
+    implementation libs.notnull
+    implementation libs.trove
+    implementation libs.fastutil
+    implementation libs.bundles.gson
+    implementation libs.bundles.mariadb
+
+}
```
New file: nu.marginalia.index.perftest.PerfTestMain

```diff
@@ -0,0 +1,334 @@
+package nu.marginalia.index.perftest;
+
+import gnu.trove.list.array.TLongArrayList;
+import nu.marginalia.api.searchquery.RpcQueryLimits;
+import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
+import nu.marginalia.api.searchquery.model.query.QueryParams;
+import nu.marginalia.api.searchquery.model.query.SearchSpecification;
+import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
+import nu.marginalia.array.page.LongQueryBuffer;
+import nu.marginalia.functions.searchquery.QueryFactory;
+import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
+import nu.marginalia.index.FullReverseIndexReader;
+import nu.marginalia.index.IndexQueryExecution;
+import nu.marginalia.index.PrioReverseIndexReader;
+import nu.marginalia.index.forward.ForwardIndexReader;
+import nu.marginalia.index.index.CombinedIndexReader;
+import nu.marginalia.index.index.StatefulIndex;
+import nu.marginalia.index.model.ResultRankingContext;
+import nu.marginalia.index.model.SearchParameters;
+import nu.marginalia.index.model.SearchTerms;
+import nu.marginalia.index.positions.PositionsFileReader;
+import nu.marginalia.index.query.IndexQuery;
+import nu.marginalia.index.query.IndexSearchBudget;
+import nu.marginalia.index.results.DomainRankingOverrides;
+import nu.marginalia.index.results.IndexResultRankingService;
+import nu.marginalia.index.results.model.ids.CombinedDocIdList;
+import nu.marginalia.index.searchset.SearchSetAny;
+import nu.marginalia.linkdb.docs.DocumentDbReader;
+import nu.marginalia.segmentation.NgramLexicon;
+import nu.marginalia.term_frequency_dict.TermFrequencyDict;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.sql.SQLException;
+import java.time.Duration;
+import java.time.Instant;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+public class PerfTestMain {
+    static Duration warmupTime = Duration.ofMinutes(1);
+    static Duration runTime = Duration.ofMinutes(10);
+
+    public static void main(String[] args) {
+        if (args.length != 4) {
+            System.err.println("Arguments: home-dir index-dir query");
+            System.exit(255);
+        }
+
+        try {
+            Path indexDir = Paths.get(args[0]);
+            if (!Files.isDirectory(indexDir)) {
+                System.err.println("Index directory is not a directory");
+                System.exit(255);
+            }
+            Path homeDir = Paths.get(args[1]);
+            String scenario = args[2];
+            String query = args[3];
+
+            switch (scenario) {
+                case "valuation" -> runValuation(indexDir, homeDir, query);
+                case "lookup" -> runLookup(indexDir, homeDir, query);
+                case "execution" -> runExecution(indexDir, homeDir, query);
+            }
+        }
+        catch (NumberFormatException e) {
+            System.err.println("Arguments: data-dir index-dir query");
+            System.exit(255);
+        }
+        catch (Exception ex) {
+            System.err.println("Error during testing");
+            ex.printStackTrace();
+            System.exit(255);
+        }
+        System.out.println(Arrays.toString(args));
+    }
+
+    private static CombinedIndexReader createCombinedIndexReader(Path indexDir) throws IOException {
+        return new CombinedIndexReader(
+                new ForwardIndexReader(
+                        indexDir.resolve("ir/fwd-doc-id.dat"),
+                        indexDir.resolve("ir/fwd-doc-data.dat"),
+                        indexDir.resolve("ir/fwd-spans.dat")
+                ),
+                new FullReverseIndexReader(
+                        "full",
+                        indexDir.resolve("ir/rev-words.dat"),
+                        indexDir.resolve("ir/rev-docs.dat"),
+                        new PositionsFileReader(indexDir.resolve("ir/rev-positions.dat"))
+                ),
+                new PrioReverseIndexReader(
+                        "prio",
+                        indexDir.resolve("ir/rev-prio-words.dat"),
+                        indexDir.resolve("ir/rev-prio-docs.dat")
+                )
+        );
+    }
+
+    private static IndexResultRankingService createIndexResultRankingService(Path indexDir, CombinedIndexReader combinedIndexReader) throws IOException, SQLException {
+        return new IndexResultRankingService(
+                new DocumentDbReader(indexDir.resolve("ldbr/documents.db")),
+                new StatefulIndex(combinedIndexReader),
+                new DomainRankingOverrides(null, Path.of("xxxx"))
+        );
+    }
+
+    static QueryFactory createQueryFactory(Path homeDir) throws IOException {
+        return new QueryFactory(
+                new QueryExpansion(
+                        new TermFrequencyDict(homeDir.resolve("model/tfreq-new-algo3.bin")),
+                        new NgramLexicon()
+                )
+        );
+    }
+
+    public static void runValuation(Path homeDir,
+                                    Path indexDir,
+                                    String rawQuery) throws IOException, SQLException
+    {
+        CombinedIndexReader indexReader = createCombinedIndexReader(indexDir);
+        QueryFactory queryFactory = createQueryFactory(homeDir);
+        IndexResultRankingService rankingService = createIndexResultRankingService(indexDir, indexReader);
+
+        var queryLimits = RpcQueryLimits.newBuilder()
+                .setTimeoutMs(10_000)
+                .setResultsTotal(1000)
+                .setResultsByDomain(10)
+                .setFetchSize(4096)
+                .build();
+        SearchSpecification parsedQuery = queryFactory.createQuery(new QueryParams(rawQuery, queryLimits, "NONE", NsfwFilterTier.OFF), PrototypeRankingParameters.sensibleDefaults()).specs;
+
+        System.out.println("Query compiled to: " + parsedQuery.query.compiledQuery);
+
+        SearchParameters searchParameters = new SearchParameters(parsedQuery, new SearchSetAny());
+
+        List<IndexQuery> queries = indexReader.createQueries(new SearchTerms(searchParameters.query, searchParameters.compiledQueryIds), searchParameters.queryParams);
+
+        TLongArrayList allResults = new TLongArrayList();
+        LongQueryBuffer buffer = new LongQueryBuffer(4096);
+
+        for (var query : queries) {
+            while (query.hasMore() && allResults.size() < 4096 ) {
+                query.getMoreResults(buffer);
+                allResults.addAll(buffer.copyData());
+            }
+            if (allResults.size() >= 4096)
+                break;
+        }
+        allResults.sort();
+        if (allResults.size() > 4096) {
+            allResults.subList(4096, allResults.size()).clear();
+        }
+
+        var docIds = new CombinedDocIdList(allResults.toArray());
+        var rankingContext = ResultRankingContext.create(indexReader, searchParameters);
+
+        System.out.println("Running warmup loop!");
+        int sum = 0;
+
+        Instant runEndTime = Instant.now().plus(warmupTime);
+        int iter;
+        IndexSearchBudget budget = new IndexSearchBudget(10000);
+        for (iter = 0;; iter++) {
+            sum += rankingService.rankResults(rankingContext, budget, docIds, false).size();
+            if ((iter % 100) == 0 && Instant.now().isAfter(runEndTime)) {
+                break;
+            }
+        }
+        System.out.println("Warmup complete after " + iter + " iters!");
+
+        runEndTime = Instant.now().plus(runTime);
+        Instant runStartTime = Instant.now();
+        int sum2 = 0;
+        List<Double> times = new ArrayList<>();
+        for (iter = 0;; iter++) {
+            long start = System.nanoTime();
+            sum2 += rankingService.rankResults(rankingContext, budget, docIds, false).size();
+            long end = System.nanoTime();
+            times.add((end - start)/1_000_000.);
+
+            if ((iter % 100) == 0) {
+                if (Instant.now().isAfter(runEndTime)) {
+                    break;
+                }
+                System.out.println(Duration.between(runStartTime, Instant.now()).toMillis() / 1000. + " best times: " + (allResults.size() / 4096.) * times.stream().mapToDouble(Double::doubleValue).sorted().limit(3).average().orElse(-1));
+            }
+        }
+        System.out.println("Benchmark complete after " + iter + " iters!");
+        System.out.println("Best times: " + (allResults.size() / 4096.) * times.stream().mapToDouble(Double::doubleValue).sorted().limit(3).average().orElse(-1));
+        System.out.println("Warmup sum: " + sum);
+        System.out.println("Main sum: " + sum2);
+        System.out.println(docIds.size());
+    }
+
+    public static void runExecution(Path homeDir,
+                                    Path indexDir,
+                                    String rawQuery) throws IOException, SQLException, InterruptedException {
+        CombinedIndexReader indexReader = createCombinedIndexReader(indexDir);
+        QueryFactory queryFactory = createQueryFactory(homeDir);
+        IndexResultRankingService rankingService = createIndexResultRankingService(indexDir, indexReader);
+
+        var queryLimits = RpcQueryLimits.newBuilder()
+                .setTimeoutMs(50)
+                .setResultsTotal(1000)
+                .setResultsByDomain(10)
+                .setFetchSize(4096)
+                .build();
+        SearchSpecification parsedQuery = queryFactory.createQuery(new QueryParams(rawQuery, queryLimits, "NONE", NsfwFilterTier.OFF), PrototypeRankingParameters.sensibleDefaults()).specs;
+        System.out.println("Query compiled to: " + parsedQuery.query.compiledQuery);
+
+        System.out.println("Running warmup loop!");
+        int sum = 0;
+
+        Instant runEndTime = Instant.now().plus(warmupTime);
+
+        int iter;
+        for (iter = 0;; iter++) {
+            SearchParameters searchParameters = new SearchParameters(parsedQuery, new SearchSetAny());
+            var execution = new IndexQueryExecution(searchParameters, rankingService, indexReader);
+            execution.run();
+            sum += execution.itemsProcessed();
+            if ((iter % 100) == 0 && Instant.now().isAfter(runEndTime)) {
+                break;
+            }
+        }
+        System.out.println("Warmup complete after " + iter + " iters!");
+
+        runEndTime = Instant.now().plus(runTime);
+        Instant runStartTime = Instant.now();
+        int sum2 = 0;
+        List<Double> rates = new ArrayList<>();
+        for (iter = 0;; iter++) {
+            SearchParameters searchParameters = new SearchParameters(parsedQuery, new SearchSetAny());
+            var execution = new IndexQueryExecution(searchParameters, rankingService, indexReader);
+            long start = System.nanoTime();
+            execution.run();
+            long end = System.nanoTime();
+            sum2 += execution.itemsProcessed();
+            rates.add(execution.itemsProcessed() / ((end - start)/1_000_000_000.));
+
+            if ((iter % 100) == 0) {
+                if (Instant.now().isAfter(runEndTime)) {
+                    break;
+                }
+                System.out.println(Duration.between(runStartTime, Instant.now()).toMillis() / 1000. + " best rates: " + rates.stream().mapToDouble(Double::doubleValue).map(i -> -i).sorted().map(i -> -i).limit(3).average().orElse(-1));
+            }
+        }
+        System.out.println("Benchmark complete after " + iter + " iters!");
+        System.out.println("Best counts: " + rates.stream().mapToDouble(Double::doubleValue).map(i -> -i).sorted().map(i -> -i).limit(3).average().orElse(-1));
+        System.out.println("Warmup sum: " + sum);
+        System.out.println("Main sum: " + sum2);
+    }
+
+    public static void runLookup(Path homeDir,
+                                 Path indexDir,
+                                 String rawQuery) throws IOException, SQLException
+    {
+        CombinedIndexReader indexReader = createCombinedIndexReader(indexDir);
+        QueryFactory queryFactory = createQueryFactory(homeDir);
+
+        var queryLimits = RpcQueryLimits.newBuilder()
+                .setTimeoutMs(10_000)
+                .setResultsTotal(1000)
+                .setResultsByDomain(10)
+                .setFetchSize(4096)
+                .build();
+        SearchSpecification parsedQuery = queryFactory.createQuery(new QueryParams(rawQuery, queryLimits, "NONE", NsfwFilterTier.OFF), PrototypeRankingParameters.sensibleDefaults()).specs;
+
+        System.out.println("Query compiled to: " + parsedQuery.query.compiledQuery);
+
+        SearchParameters searchParameters = new SearchParameters(parsedQuery, new SearchSetAny());
+
+        Instant runEndTime = Instant.now().plus(warmupTime);
+
+        LongQueryBuffer buffer = new LongQueryBuffer(4096);
+        int sum1 = 0;
+        int iter;
+        for (iter = 0;; iter++) {
+            List<IndexQuery> queries = indexReader.createQueries(new SearchTerms(searchParameters.query, searchParameters.compiledQueryIds), searchParameters.queryParams);
+
+            for (var query : queries) {
+                while (query.hasMore()) {
+                    query.getMoreResults(buffer);
+                    sum1 += buffer.end;
+                    buffer.reset();
+                }
+            }
+
+            if ((iter % 100) == 0 && Instant.now().isAfter(runEndTime)) {
+                break;
+            }
+        }
+
+        System.out.println("Warmup complete after " + iter + " iters with sum1 = " + sum1);
+
+        runEndTime = Instant.now().plus(runTime);
+        Instant runStartTime = Instant.now();
+        int sum2 = 0;
+        List<Double> times = new ArrayList<>();
+        for (iter = 0;; iter++) {
+            List<IndexQuery> queries = indexReader.createQueries(new SearchTerms(searchParameters.query, searchParameters.compiledQueryIds), searchParameters.queryParams);
+
+            long start = System.nanoTime();
+            for (var query : queries) {
+                while (query.hasMore()) {
+                    query.getMoreResults(buffer);
+                    sum1 += buffer.end;
+                    buffer.reset();
+                }
+            }
+            long end = System.nanoTime();
+            times.add((end - start)/1_000_000.);
+
+            if ((iter % 100) == 0) {
+                if (Instant.now().isAfter(runEndTime)) {
+                    break;
+                }
+                System.out.println(Duration.between(runStartTime, Instant.now()).toMillis() / 1000. + " best times: " + times.stream().mapToDouble(Double::doubleValue).sorted().limit(3).average().orElse(-1));
+            }
+        }
+        System.out.println("Benchmark complete after " + iter + " iters!");
+        System.out.println("Best times: " + times.stream().mapToDouble(Double::doubleValue).sorted().limit(3).average().orElse(-1));
+        System.out.println("Warmup sum: " + sum1);
+        System.out.println("Main sum: " + sum2);
+    }
+}
```
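Going by the argument parsing in main() above, the tool takes four arguments: an index directory, a home directory, a scenario name out of valuation, lookup, or execution, and the raw query string (the usage messages in the error branches list only three of these). Since the build file above applies the Gradle application plugin with this main class, an invocation along the lines of gradle :code:index:index-perftest:run --args='<index-dir> <home-dir> lookup "some query"' should work, assuming the standard run task wiring.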
```diff
@@ -3,8 +3,8 @@ package nu.marginalia.index;
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;
 import nu.marginalia.btree.BTreeReader;
-import nu.marginalia.index.positions.TermData;
 import nu.marginalia.index.positions.PositionsFileReader;
+import nu.marginalia.index.positions.TermData;
 import nu.marginalia.index.query.EmptyEntrySource;
 import nu.marginalia.index.query.EntrySource;
 import nu.marginalia.index.query.ReverseIndexRejectFilter;
```
@@ -161,12 +161,7 @@ public class FullReverseIndexReader {
|
|||||||
// Read the size and offset of the position data
|
// Read the size and offset of the position data
|
||||||
var offsets = reader.queryData(docIds, 1);
|
var offsets = reader.queryData(docIds, 1);
|
||||||
|
|
||||||
for (int i = 0; i < docIds.length; i++) {
|
return positionsFileReader.getTermData(arena, offsets);
|
||||||
if (offsets[i] == 0)
|
|
||||||
continue;
|
|
||||||
ret[i] = positionsFileReader.getTermData(arena, offsets[i]);
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void close() {
|
public void close() {
|
||||||
|
@@ -5,39 +5,84 @@ import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.lang.foreign.Arena;
+import java.nio.ByteBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ForkJoinPool;
 
+/** Reads positions data from the positions file */
 public class PositionsFileReader implements AutoCloseable {
-    private final FileChannel positions;
+
+    // We use multiple file channels to avoid reads becoming serialized by the kernel.
+    // If we don't do this, multi-threaded reads become strictly slower than single-threaded reads
+    // (which is why AsynchronousFileChannel sucks).
+
+    // This is likely the best option apart from O_DIRECT or FFI:ing in libaio or io_uring.
+
+    private final FileChannel[] positions;
+    private final ForkJoinPool forkJoinPool;
     private static final Logger logger = LoggerFactory.getLogger(PositionsFileReader.class);
 
     public PositionsFileReader(Path positionsFile) throws IOException {
-        this.positions = FileChannel.open(positionsFile, StandardOpenOption.READ);
+        this(positionsFile, 8);
     }
 
-    /** Get the positions for a term in the index, as pointed out by the encoded offset;
-     * intermediate buffers are allocated from the provided arena allocator. */
-    public TermData getTermData(Arena arena, long sizeEncodedOffset) {
-        int length = PositionCodec.decodeSize(sizeEncodedOffset);
-        long offset = PositionCodec.decodeOffset(sizeEncodedOffset);
-
-        var segment = arena.allocate(length);
-        var buffer = segment.asByteBuffer();
-
-        try {
-            positions.read(buffer, offset);
-        } catch (IOException e) {
-            throw new RuntimeException(e);
-        }
-
-        return new TermData(buffer);
+    public PositionsFileReader(Path positionsFile, int nreaders) throws IOException {
+        positions = new FileChannel[nreaders];
+        for (int i = 0; i < positions.length; i++) {
+            positions[i] = FileChannel.open(positionsFile, StandardOpenOption.READ);
+        }
+        forkJoinPool = new ForkJoinPool(nreaders);
     }
 
     @Override
     public void close() throws IOException {
-        positions.close();
+        for (FileChannel fc : positions) {
+            fc.close();
+        }
+        forkJoinPool.close();
+    }
+
+    /** Get the positions for the keywords in the index, as pointed out by the encoded offsets;
+     * intermediate buffers are allocated from the provided arena allocator. */
+    public TermData[] getTermData(Arena arena, long[] offsets) {
+        TermData[] ret = new TermData[offsets.length];
+
+        int tasks = 0;
+        for (long l : offsets) if (l != 0) tasks++;
+
+        CountDownLatch cl = new CountDownLatch(tasks);
+
+        for (int i = 0; i < offsets.length; i++) {
+            long encodedOffset = offsets[i];
+            if (encodedOffset == 0) continue;
+
+            int idx = i;
+            int length = PositionCodec.decodeSize(encodedOffset);
+            long offset = PositionCodec.decodeOffset(encodedOffset);
+            ByteBuffer buffer = arena.allocate(length).asByteBuffer();
+
+            forkJoinPool.execute(() -> {
+                try {
+                    positions[idx % positions.length].read(buffer, offset);
+                    ret[idx] = new TermData(buffer);
+                    cl.countDown();
+                }
+                catch (IOException ex) {
+                    logger.error("Failed to read positions file", ex);
+                }
+            });
+        }
+
+        try {
+            cl.await();
+        } catch (InterruptedException e) {
+            throw new RuntimeException(e);
+        }
+
+        return ret;
     }
 }
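The interesting part of this change is the channel striping. The commit comment attributes the serialization of concurrent reads on a single FileChannel to the kernel; whatever the precise mechanism, opening several descriptors against the same file and fanning requests across them lets positional reads proceed in parallel. A standalone sketch of the idiom (hypothetical class, not part of this changeset):

    import java.io.IOException;
    import java.nio.ByteBuffer;
    import java.nio.channels.FileChannel;
    import java.nio.file.Path;
    import java.nio.file.StandardOpenOption;

    // N read-only descriptors on the same file; each request picks one
    // round-robin, so up to N positional reads can be in flight at once.
    class StripedFileReader implements AutoCloseable {
        private final FileChannel[] channels;

        StripedFileReader(Path file, int stripes) throws IOException {
            channels = new FileChannel[stripes];
            for (int i = 0; i < stripes; i++) {
                channels[i] = FileChannel.open(file, StandardOpenOption.READ);
            }
        }

        // Positional read (pread-style); does not touch the channel's cursor.
        void read(int requestId, ByteBuffer dst, long offset) throws IOException {
            channels[requestId % channels.length].read(dst, offset);
        }

        @Override
        public void close() throws IOException {
            for (FileChannel ch : channels) ch.close();
        }
    }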
@@ -11,7 +11,6 @@ import org.junit.jupiter.api.Test;
 
 import java.io.IOException;
 import java.lang.foreign.Arena;
-import java.nio.ByteBuffer;
 import java.nio.file.Files;
 import java.nio.file.Path;
 
@@ -32,7 +31,6 @@ class PositionsFileReaderTest {
 
     @Test
     void getTermData() throws IOException {
-        ByteBuffer workArea = ByteBuffer.allocate(8192);
        long key1, key2, key3;
        try (PositionsFileConstructor constructor = new PositionsFileConstructor(file)) {
            key1 = constructor.add((byte) 43, VarintCodedSequence.generate(1, 2, 3).buffer());
@@ -44,20 +42,19 @@ class PositionsFileReaderTest {
        System.out.println("key2: " + Long.toHexString(key2));
        System.out.println("key3: " + Long.toHexString(key3));
 
-       try (Arena arena = Arena.ofConfined();
+       try (Arena arena = Arena.ofShared();
             PositionsFileReader reader = new PositionsFileReader(file))
        {
-           TermData data1 = reader.getTermData(arena, key1);
-           assertEquals(43, data1.flags());
-           assertEquals(IntList.of( 1, 2, 3), data1.positions().values());
+           TermData[] data = reader.getTermData(arena, new long[] { key1, key2, key3 });
 
-           TermData data2 = reader.getTermData(arena, key2);
-           assertEquals(51, data2.flags());
-           assertEquals(IntList.of(2, 3, 5, 1000, 5000, 20241), data2.positions().values());
+           assertEquals(43, data[0].flags());
+           assertEquals(IntList.of( 1, 2, 3), data[0].positions().values());
 
-           TermData data3 = reader.getTermData(arena, key3);
-           assertEquals(61, data3.flags());
-           assertEquals(IntList.of(3, 5, 7), data3.positions().values());
+           assertEquals(51, data[1].flags());
+           assertEquals(IntList.of(2, 3, 5, 1000, 5000, 20241), data[1].positions().values());
+
+           assertEquals(61, data[2].flags());
+           assertEquals(IntList.of(3, 5, 7), data[2].positions().values());
        }
    }
 }
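The switch from Arena.ofConfined() to Arena.ofShared() follows from the threading change: buffers allocated in the test's arena are now filled by ForkJoinPool worker threads, and a confined arena's segments may only be touched by the thread that created the arena. A minimal demonstration of the difference (JDK 21+ foreign memory API):

    import java.lang.foreign.Arena;
    import java.lang.foreign.MemorySegment;
    import java.lang.foreign.ValueLayout;

    public class ArenaDemo {
        public static void main(String[] args) throws InterruptedException {
            try (Arena shared = Arena.ofShared()) {
                MemorySegment seg = shared.allocate(8);
                Thread t = new Thread(() -> seg.set(ValueLayout.JAVA_LONG, 0, 42L));
                t.start();
                t.join(); // OK: shared arenas permit access from any thread
            }

            try (Arena confined = Arena.ofConfined()) {
                MemorySegment seg = confined.allocate(8);
                Thread t = new Thread(() -> {
                    try {
                        seg.set(ValueLayout.JAVA_LONG, 0, 42L);
                    } catch (WrongThreadException e) {
                        System.out.println("rejected: " + e); // confined arena, wrong thread
                    }
                });
                t.start();
                t.join();
            }
        }
    }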
@@ -7,24 +7,13 @@ import io.grpc.stub.StreamObserver;
 import io.prometheus.client.Counter;
 import io.prometheus.client.Gauge;
 import io.prometheus.client.Histogram;
-import it.unimi.dsi.fastutil.longs.LongArrayList;
 import nu.marginalia.api.searchquery.IndexApiGrpc;
 import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
 import nu.marginalia.api.searchquery.RpcIndexQuery;
-import nu.marginalia.api.searchquery.RpcResultRankingParameters;
-import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
-import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
-import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
 import nu.marginalia.api.searchquery.model.query.SearchSpecification;
-import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
-import nu.marginalia.array.page.LongQueryBuffer;
 import nu.marginalia.index.index.StatefulIndex;
 import nu.marginalia.index.model.SearchParameters;
-import nu.marginalia.index.model.SearchTerms;
-import nu.marginalia.index.query.IndexQuery;
-import nu.marginalia.index.query.IndexSearchBudget;
 import nu.marginalia.index.results.IndexResultRankingService;
-import nu.marginalia.index.results.model.ids.CombinedDocIdList;
 import nu.marginalia.index.searchset.SearchSet;
 import nu.marginalia.index.searchset.SearchSetsService;
 import nu.marginalia.index.searchset.SmallSearchSet;
@@ -35,14 +24,7 @@ import org.slf4j.LoggerFactory;
 import org.slf4j.Marker;
 import org.slf4j.MarkerFactory;
 
-import java.util.BitSet;
 import java.util.List;
-import java.util.concurrent.ArrayBlockingQueue;
-import java.util.concurrent.Executor;
-import java.util.concurrent.Executors;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.concurrent.atomic.AtomicLong;
 
 @Singleton
 public class IndexGrpcService
@@ -88,23 +70,22 @@ public class IndexGrpcService
     private final StatefulIndex statefulIndex;
     private final SearchSetsService searchSetsService;
 
-    private final IndexResultRankingService resultValuator;
+    private final IndexResultRankingService rankingService;
 
     private final String nodeName;
 
-    private static final int indexValuationThreads = Integer.getInteger("index.valuationThreads", 16);
-
     @Inject
     public IndexGrpcService(ServiceConfiguration serviceConfiguration,
                             StatefulIndex statefulIndex,
                             SearchSetsService searchSetsService,
-                            IndexResultRankingService resultValuator)
+                            IndexResultRankingService rankingService)
     {
         var nodeId = serviceConfiguration.node();
         this.nodeName = Integer.toString(nodeId);
         this.statefulIndex = statefulIndex;
         this.searchSetsService = searchSetsService;
-        this.resultValuator = resultValuator;
+        this.rankingService = rankingService;
     }
 
     // GRPC endpoint
@@ -121,7 +102,13 @@ public class IndexGrpcService
                 .time(() -> {
                     // Perform the search
                     try {
-                        return executeSearch(params);
+                        if (!statefulIndex.isLoaded()) {
+                            // Short-circuit if the index is not loaded, as we trivially know that there can be no results
+                            return List.of();
+                        }
+
+                        return new IndexQueryExecution(params, rankingService, statefulIndex.get()).run();
                     }
                     catch (Exception ex) {
                         logger.error("Error in handling request", ex);
@@ -157,7 +144,12 @@ public class IndexGrpcService
     // exists for test access
     public List<RpcDecoratedResultItem> justQuery(SearchSpecification specsSet) {
         try {
-            return executeSearch(new SearchParameters(specsSet, getSearchSet(specsSet)));
+            if (!statefulIndex.isLoaded()) {
+                // Short-circuit if the index is not loaded, as we trivially know that there can be no results
+                return List.of();
+            }
+
+            return new IndexQueryExecution(new SearchParameters(specsSet, getSearchSet(specsSet)), rankingService, statefulIndex.get()).run();
        }
        catch (Exception ex) {
            logger.error("Error in handling request", ex);
@@ -183,262 +175,6 @@ public class IndexGrpcService
         return searchSetsService.getSearchSetByName(request.getSearchSetIdentifier());
     }
 
-    // accessible for tests
-    public List<RpcDecoratedResultItem> executeSearch(SearchParameters params) throws Exception {
-
-        if (!statefulIndex.isLoaded()) {
-            // Short-circuit if the index is not loaded, as we trivially know that there can be no results
-            return List.of();
-        }
-
-        ResultRankingContext rankingContext = createRankingContext(params.rankingParams,
-                params.compiledQuery,
-                params.compiledQueryIds);
-
-        var queryExecution = new QueryExecution(rankingContext, params.fetchSize);
-
-        List<RpcDecoratedResultItem> ret = queryExecution.run(params);
-
-        wmsa_index_query_exec_block_time
-                .labels(nodeName)
-                .set(queryExecution.getBlockTime() / 1000.);
-        wmsa_index_query_exec_stall_time
-                .labels(nodeName)
-                .set(queryExecution.getStallTime() / 1000.);
-
-        return ret;
-    }
-
-    /** This class is responsible for ranking the results and adding the best results to the
-     * resultHeap, which depending on the state of the indexLookup threads may or may not block
-     */
-    private ResultRankingContext createRankingContext(RpcResultRankingParameters rankingParams,
-                                                      CompiledQuery<String> compiledQuery,
-                                                      CompiledQueryLong compiledQueryIds)
-    {
-        int[] full = new int[compiledQueryIds.size()];
-        int[] prio = new int[compiledQueryIds.size()];
-
-        BitSet ngramsMask = new BitSet(compiledQuery.size());
-        BitSet regularMask = new BitSet(compiledQuery.size());
-
-        var currentIndex = statefulIndex.get();
-
-        for (int idx = 0; idx < compiledQueryIds.size(); idx++) {
-            long id = compiledQueryIds.at(idx);
-            full[idx] = currentIndex.numHits(id);
-            prio[idx] = currentIndex.numHitsPrio(id);
-
-            if (compiledQuery.at(idx).contains("_")) {
-                ngramsMask.set(idx);
-            }
-            else {
-                regularMask.set(idx);
-            }
-        }
-
-        return new ResultRankingContext(currentIndex.totalDocCount(),
-                rankingParams,
-                ngramsMask,
-                regularMask,
-                new CqDataInt(full),
-                new CqDataInt(prio));
-    }
-
-    /** This class is responsible for executing a search query. It uses a thread pool to
-     * execute the subqueries and their valuation in parallel. The results are then combined
-     * into a bounded priority queue, and finally the best results are returned.
-     */
-    private class QueryExecution {
-
-        private static final Executor workerPool = Executors.newCachedThreadPool();
-
-        /** The queue where the results from the index lookup threads are placed,
-         * pending ranking by the result ranker threads */
-        private final ArrayBlockingQueue<CombinedDocIdList> resultCandidateQueue
-                = new ArrayBlockingQueue<>(64);
-
-        private final ResultPriorityQueue resultHeap;
-        private final ResultRankingContext resultRankingContext;
-
-        private final AtomicInteger remainingIndexTasks = new AtomicInteger(0);
-        private final AtomicInteger remainingValuationTasks = new AtomicInteger(0);
-
-        private final AtomicLong blockTime = new AtomicLong(0);
-        private final AtomicLong stallTime = new AtomicLong(0);
-
-        public long getStallTime() {
-            return stallTime.get();
-        }
-
-        public long getBlockTime() {
-            return blockTime.get();
-        }
-
-        private QueryExecution(ResultRankingContext resultRankingContext, int maxResults) {
-            this.resultRankingContext = resultRankingContext;
-            this.resultHeap = new ResultPriorityQueue(maxResults);
-        }
-
-        /** Execute a search query */
-        public List<RpcDecoratedResultItem> run(SearchParameters parameters) throws Exception {
-
-            var terms = new SearchTerms(parameters.query, parameters.compiledQueryIds);
-
-            var currentIndex = statefulIndex.get();
-            for (var indexQuery : currentIndex.createQueries(terms, parameters.queryParams)) {
-                workerPool.execute(new IndexLookup(indexQuery, parameters.budget));
-            }
-
-            for (int i = 0; i < indexValuationThreads; i++) {
-                workerPool.execute(new ResultRanker(parameters, resultRankingContext));
-            }
-
-            // Wait for all tasks to complete
-            awaitCompletion();
-
-            // Return the best results
-            return resultValuator.selectBestResults(parameters, resultRankingContext, resultHeap);
-        }
-
-        /** Wait for all tasks to complete */
-        private void awaitCompletion() throws InterruptedException {
-            synchronized (remainingValuationTasks) {
-                while (remainingValuationTasks.get() > 0) {
-                    remainingValuationTasks.wait(20);
-                }
-            }
-        }
-
-        /** This class is responsible for executing a subquery and adding the results to the
-         * resultCandidateQueue, which depending on the state of the valuator threads may
-         * or may not block */
-        class IndexLookup implements Runnable {
-            private final IndexQuery query;
-            private final IndexSearchBudget budget;
-
-            IndexLookup(IndexQuery query,
-                        IndexSearchBudget budget) {
-                this.query = query;
-                this.budget = budget;
-
-                remainingIndexTasks.incrementAndGet();
-            }
-
-            public void run() {
-                try {
-                    executeSearch();
-                }
-                catch (Exception ex) {
-                    logger.error("Error in index lookup", ex);
-                }
-                finally {
-                    synchronized (remainingIndexTasks) {
-                        if (remainingIndexTasks.decrementAndGet() == 0) {
-                            remainingIndexTasks.notifyAll();
-                        }
-                    }
-                }
-            }
-
-            private void executeSearch() {
-                final LongArrayList results = new LongArrayList(16);
-
-                // These queries are different indices for one subquery
-                final LongQueryBuffer buffer = new LongQueryBuffer(4096);
-
-                while (query.hasMore() && budget.hasTimeLeft())
-                {
-                    buffer.reset();
-                    query.getMoreResults(buffer);
-
-                    for (int i = 0; i < buffer.end; i+=16) {
-                        for (int j = 0; j < Math.min(buffer.end - i, 16); j++) {
-                            results.add(buffer.data.get(i+j));
-                        }
-                        enqueueResults(new CombinedDocIdList(results));
-                        results.clear();
-                    }
-                }
-
-                buffer.dispose();
-            }
-
-            private void enqueueResults(CombinedDocIdList resultIds) {
-                long remainingTime = budget.timeLeft();
-
-                try {
-                    if (!resultCandidateQueue.offer(resultIds)) {
-                        long start = System.currentTimeMillis();
-                        resultCandidateQueue.offer(resultIds, remainingTime, TimeUnit.MILLISECONDS);
-                        blockTime.addAndGet(System.currentTimeMillis() - start);
-                    }
-                }
-                catch (InterruptedException e) {
-                    logger.warn("Interrupted while waiting to offer resultIds to queue", e);
-                }
-            }
-
-        }
-
-        class ResultRanker implements Runnable {
-            private final SearchParameters parameters;
-            private final ResultRankingContext rankingContext;
-
-            ResultRanker(SearchParameters parameters, ResultRankingContext rankingContext) {
-                this.parameters = parameters;
-                this.rankingContext = rankingContext;
-
-                remainingValuationTasks.incrementAndGet();
-            }
-
-            public void run() {
-                try {
-                    while (parameters.budget.timeLeft() > 0 && execute());
-                }
-                catch (InterruptedException e) {
-                    logger.warn("Interrupted while waiting to poll resultIds from queue", e);
-                }
-                catch (Exception e) {
-                    logger.error("Exception while ranking results", e);
-                }
-                finally {
-                    synchronized (remainingValuationTasks) {
-                        if (remainingValuationTasks.decrementAndGet() == 0)
-                            remainingValuationTasks.notifyAll();
-                    }
-                }
-            }
-
-            private boolean execute() throws Exception {
-                long start = System.currentTimeMillis();
-
-                // Do a relatively short poll to ensure we terminate in a timely manner
-                // in the event all work is done
-                final long pollTime = Math.clamp(parameters.budget.timeLeft(), 1, 5);
-                CombinedDocIdList resultIds = resultCandidateQueue.poll(pollTime, TimeUnit.MILLISECONDS);
-
-                if (resultIds == null) {
-                    // check if we are done and can terminate
-                    if (remainingIndexTasks.get() == 0 && resultCandidateQueue.isEmpty()) {
-                        return false;
-                    }
-                }
-                else {
-                    stallTime.addAndGet(System.currentTimeMillis() - start);
-
-                    resultHeap.addAll(
-                            resultValuator.rankResults(parameters, false, rankingContext, resultIds)
-                    );
-                }
-
-                return true; // keep going
-            }
-
-        }
-    }
 
 }
code/index/java/nu/marginalia/index/IndexQueryExecution.java (new file, 137 lines)
@@ -0,0 +1,137 @@
+package nu.marginalia.index;
+
+import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
+import nu.marginalia.array.page.LongQueryBuffer;
+import nu.marginalia.index.index.CombinedIndexReader;
+import nu.marginalia.index.model.ResultRankingContext;
+import nu.marginalia.index.model.SearchParameters;
+import nu.marginalia.index.model.SearchTerms;
+import nu.marginalia.index.query.IndexQuery;
+import nu.marginalia.index.query.IndexSearchBudget;
+import nu.marginalia.index.results.IndexResultRankingService;
+import nu.marginalia.index.results.model.ids.CombinedDocIdList;
+
+import java.sql.SQLException;
+import java.util.List;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ForkJoinPool;
+
+/** Performs an index query */
+public class IndexQueryExecution {
+
+    private static final int indexValuationThreads = Integer.getInteger("index.valuationThreads", 16);
+
+    private static final ForkJoinPool lookupPool = new ForkJoinPool(indexValuationThreads);
+    private static final ForkJoinPool evaluationPool = new ForkJoinPool(indexValuationThreads);
+
+    private final IndexResultRankingService rankingService;
+
+    private final ResultRankingContext rankingContext;
+    private final List<IndexQuery> queries;
+    private final IndexSearchBudget budget;
+    private final ResultPriorityQueue resultHeap;
+    private final CountDownLatch executionCountdown;
+
+    private final int limitTotal;
+    private final int limitByDomain;
+
+    private int evaluationJobCounter;
+
+    public IndexQueryExecution(SearchParameters params,
+                               IndexResultRankingService rankingService,
+                               CombinedIndexReader currentIndex) {
+        this.rankingService = rankingService;
+
+        resultHeap = new ResultPriorityQueue(params.fetchSize);
+
+        budget = params.budget;
+        limitByDomain = params.limitByDomain;
+        limitTotal = params.limitTotal;
+
+        rankingContext = ResultRankingContext.create(currentIndex, params);
+        queries = currentIndex.createQueries(new SearchTerms(params.query, params.compiledQueryIds), params.queryParams);
+        executionCountdown = new CountDownLatch(queries.size());
+
+        evaluationJobCounter = 0;
+    }
+
+    public List<RpcDecoratedResultItem> run() throws InterruptedException, SQLException {
+        // Spawn lookup tasks for each query
+        for (IndexQuery query : queries) {
+            lookupPool.execute(() -> lookup(query));
+        }
+
+        // Await lookup task termination (this guarantees we're no longer creating new evaluation tasks)
+        executionCountdown.await();
+
+        // Await evaluation task termination
+        synchronized (IndexQueryExecution.this) {
+            while (evaluationJobCounter > 0 && budget.hasTimeLeft()) {
+                IndexQueryExecution.this.wait(budget.timeLeft());
+            }
+        }
+
+        // Final result selection
+        return rankingService.selectBestResults(limitByDomain, limitTotal, rankingContext, resultHeap);
+    }
+
+    private void lookup(IndexQuery query) {
+        final LongQueryBuffer buffer = new LongQueryBuffer(1024);
+        try {
+            while (query.hasMore() && budget.hasTimeLeft()) {
+
+                buffer.reset();
+                query.getMoreResults(buffer);
+
+                if (buffer.isEmpty())
+                    continue;
+
+                CombinedDocIdList docIds = new CombinedDocIdList(buffer);
+
+                boolean stealWork = false;
+                synchronized (IndexQueryExecution.this) {
+                    // Hold off on spawning new evaluation jobs if we have too many queued
+                    // to avoid backpressure, instead steal work into the lookup thread
+                    // in this scenario
+
+                    if (evaluationJobCounter > indexValuationThreads * 8) {
+                        stealWork = true;
+                    }
+                    else {
+                        evaluationJobCounter++;
+                    }
+                }
+
+                if (stealWork) {
+                    resultHeap.addAll(rankingService.rankResults(rankingContext, budget, docIds, false));
+                }
+                else {
+                    // Spawn an evaluation task
+                    evaluationPool.execute(() -> evaluate(docIds));
+                }
+            }
+        } finally {
+            buffer.dispose();
+            executionCountdown.countDown();
+        }
+    }
+
+    private void evaluate(CombinedDocIdList docIds) {
+        try {
+            if (!budget.hasTimeLeft())
+                return;
+            resultHeap.addAll(rankingService.rankResults(rankingContext, budget, docIds, false));
+        } finally {
+            synchronized (IndexQueryExecution.this) {
+                if (--evaluationJobCounter == 0) {
+                    IndexQueryExecution.this.notifyAll();
+                }
+            }
+        }
+    }
+
+    public int itemsProcessed() {
+        return resultHeap.getItemsProcessed();
+    }
+
+}
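The synchronized counter in lookup() is a hand-rolled bounded-dispatch mechanism: at most indexValuationThreads * 8 evaluation jobs may be in flight, and past that cap the lookup thread ranks the batch itself instead of queueing it, which slows the producer to the consumers' pace rather than letting work pile up without bound. The same idiom expressed with a Semaphore, as a hypothetical alternative (not the code in this changeset):

    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Semaphore;

    // Producers try to reserve a slot; when none is free they run the job
    // inline, applying backpressure instead of growing an unbounded queue.
    class BoundedDispatch {
        private final Semaphore slots;
        private final ExecutorService pool;

        BoundedDispatch(ExecutorService pool, int maxInFlight) {
            this.pool = pool;
            this.slots = new Semaphore(maxInFlight);
        }

        void dispatch(Runnable job) {
            if (slots.tryAcquire()) {
                pool.execute(() -> {
                    try { job.run(); }
                    finally { slots.release(); }
                });
            } else {
                job.run(); // steal the work into the calling thread
            }
        }
    }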
@@ -1,116 +1,59 @@
 package nu.marginalia.index;
 
+import com.google.common.collect.MinMaxPriorityQueue;
 import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
 import nu.marginalia.api.searchquery.model.results.SearchResultItem;
 import org.jetbrains.annotations.NotNull;
 
-import java.util.*;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.Iterator;
 
 /** A priority queue for search results. This class is not thread-safe,
  * in general, except for concurrent use of the addAll method.
  * <p></p>
- * The class implements a subset of the Collection interface, and
- * is intended to be used as a priority queue for search results,
- * with a maximum size.
- * <p></p>
  * Since the expected use case is to add a large number of items
  * and then iterate over the items, the class is optimized for
 * this scenario, and does not implement other mutating methods
 * than addAll().
 */
-public class ResultPriorityQueue implements Iterable<SearchResultItem>,
-        Collection<SearchResultItem> {
-    private final int limit;
-    private final ArrayList<SearchResultItem> backingList = new ArrayList<>();
+public class ResultPriorityQueue implements Iterable<SearchResultItem> {
     private final LongOpenHashSet idsInSet = new LongOpenHashSet();
+    private final MinMaxPriorityQueue<SearchResultItem> queue;
+
+    private int itemsProcessed = 0;
 
     public ResultPriorityQueue(int limit) {
-        this.limit = limit;
+        this.queue = MinMaxPriorityQueue.<SearchResultItem>orderedBy(Comparator.naturalOrder()).maximumSize(limit).create();
     }
 
     public Iterator<SearchResultItem> iterator() {
-        return backingList.iterator();
-    }
-
-    @NotNull
-    @Override
-    public Object[] toArray() {
-        return backingList.toArray();
-    }
-
-    @NotNull
-    @Override
-    public <T> T[] toArray(@NotNull T[] a) {
-        return backingList.toArray(a);
-    }
-
-    @Override
-    public boolean add(SearchResultItem searchResultItem) {
-        throw new UnsupportedOperationException("Use addAll instead");
-    }
-
-    @Override
-    public boolean remove(Object o) {
-        throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public boolean containsAll(@NotNull Collection<?> c) {
-        return idsInSet.containsAll(c);
+        return queue.iterator();
     }
 
     /** Adds all items to the queue, and returns true if any items were added.
      * This is a thread-safe operation.
      */
-    @Override
     public synchronized boolean addAll(@NotNull Collection<? extends SearchResultItem> items) {
-        boolean itemsAdded = false;
+        itemsProcessed += items.size();
+
         for (var item : items) {
             if (idsInSet.add(item.getDocumentId())) {
-                backingList.add(item);
-                itemsAdded = true;
+                queue.add(item);
             }
         }
-        if (!itemsAdded) {
-            return false;
-        }
-
-        backingList.sort(Comparator.naturalOrder());
-        if (backingList.size() > limit) {
-            backingList.subList(limit, backingList.size()).clear();
-        }
-
         return true;
     }
 
-    @Override
-    public boolean removeAll(@NotNull Collection<?> c) {
-        throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public boolean retainAll(@NotNull Collection<?> c) {
-        throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public void clear() {
-        backingList.clear();
-        idsInSet.clear();
-    }
-
     public int size() {
-        return backingList.size();
+        return queue.size();
+    }
+
+    public int getItemsProcessed() {
+        return itemsProcessed;
     }
 
-    @Override
     public boolean isEmpty() {
-        return backingList.isEmpty();
+        return queue.isEmpty();
     }
-
-    @Override
-    public boolean contains(Object o) {
-        return backingList.contains(o);
-    }
 }
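The rewrite delegates eviction to Guava's MinMaxPriorityQueue: with maximumSize(n), offers beyond capacity discard the greatest element under the comparator, so the queue retains the n best items without the sort-and-truncate pass the ArrayList version performed on every addAll(). A small demonstration:

    import com.google.common.collect.MinMaxPriorityQueue;
    import java.util.Comparator;

    public class TopKDemo {
        public static void main(String[] args) {
            // Keeps the 3 smallest values seen; larger values are evicted
            // (or rejected outright) once the queue is full.
            MinMaxPriorityQueue<Integer> topK = MinMaxPriorityQueue
                    .<Integer>orderedBy(Comparator.naturalOrder())
                    .maximumSize(3)
                    .create();

            for (int v : new int[] {9, 1, 7, 3, 8, 2}) {
                topK.add(v);
            }

            System.out.println(topK.peekFirst()); // 1 (best retained)
            System.out.println(topK.peekLast());  // 3 (worst retained)
            System.out.println(topK.size());      // 3
        }
    }

One behavioural difference worth noting: per the Guava documentation the queue's iterator returns elements in no particular order, whereas the old implementation iterated a fully sorted list.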
@@ -205,14 +205,19 @@ public class CombinedIndexReader {
         return forwardIndexReader.getDocumentSize(docId);
     }
 
-    /** Retrieves the document spans for the specified document */
-    public DocumentSpans getDocumentSpans(Arena arena, long docId) {
-        return forwardIndexReader.getDocumentSpans(arena, docId);
+    /** Retrieves the document spans for the specified documents */
+    public DocumentSpans[] getDocumentSpans(Arena arena, CombinedDocIdList docIds) {
+        long[] decodedIDs = docIds.array();
+        for (int i = 0; i < decodedIDs.length; i++) {
+            decodedIDs[i] = UrlIdCodec.removeRank(decodedIDs[i]);
+        }
+
+        return forwardIndexReader.getDocumentSpans(arena, decodedIDs);
     }
 
     /** Close the indexes (this is not done immediately)
      * */
-    public void close() throws InterruptedException {
+    public void close() {
         /* Delay the invocation of close method to allow for a clean shutdown of the service.
          *
          * This is especially important when using Unsafe-based LongArrays, since we have
@@ -227,7 +232,7 @@ public class CombinedIndexReader {
     }
 
-    private void delayedCall(Runnable call, Duration delay) throws InterruptedException {
+    private void delayedCall(Runnable call, Duration delay) {
         Thread.ofPlatform().start(() -> {
             try {
                 TimeUnit.SECONDS.sleep(delay.toSeconds());
@@ -248,12 +253,13 @@ public class CombinedIndexReader {
 class ParamMatchingQueryFilter implements QueryFilterStepIf {
     private final QueryParams params;
     private final ForwardIndexReader forwardIndexReader;
+    private final boolean imposesMetaConstraint;
 
     public ParamMatchingQueryFilter(QueryParams params,
                                     ForwardIndexReader forwardIndexReader)
     {
         this.params = params;
         this.forwardIndexReader = forwardIndexReader;
+        this.imposesMetaConstraint = params.imposesDomainMetadataConstraint();
     }
 
     @Override
@@ -261,12 +267,16 @@ class ParamMatchingQueryFilter implements QueryFilterStepIf {
         long docId = UrlIdCodec.removeRank(combinedId);
         int domainId = UrlIdCodec.getDomainId(docId);
 
-        long meta = forwardIndexReader.getDocMeta(docId);
-
-        if (!validateDomain(domainId, meta)) {
+        if (!validateDomain(domainId)) {
             return false;
         }
 
+        if (!imposesMetaConstraint) {
+            return true;
+        }
+
+        long meta = forwardIndexReader.getDocMeta(docId);
+
         if (!validateQuality(meta)) {
             return false;
         }
@@ -286,8 +296,8 @@ class ParamMatchingQueryFilter implements QueryFilterStepIf {
         return true;
     }
 
-    private boolean validateDomain(int domainId, long meta) {
-        return params.searchSet().contains(domainId, meta);
+    private boolean validateDomain(int domainId) {
+        return params.searchSet().contains(domainId);
     }
 
     private boolean validateQuality(long meta) {
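The filter change is a classic invariant hoist: whether the query constrains quality, year, size, or rank is fixed for the lifetime of the filter, so it is computed once in the constructor, and the per-document hot path skips the forward-index metadata lookup whenever the answer is no. A toy sketch of the shape (stand-in types, not the real classes):

    // The constructor decides once per query whether metadata matters;
    // if it does not, test() never touches the metadata source at all.
    class MetaFilterSketch {
        interface MetaSource { long getDocMeta(long docId); }

        private final boolean imposesMetaConstraint;
        private final MetaSource metaSource;

        MetaFilterSketch(boolean imposesMetaConstraint, MetaSource metaSource) {
            this.imposesMetaConstraint = imposesMetaConstraint;
            this.metaSource = metaSource;
        }

        boolean test(long docId, boolean domainOk) {
            if (!domainOk) return false;
            if (!imposesMetaConstraint) return true; // skip the metadata read entirely
            return validate(metaSource.getDocMeta(docId));
        }

        private boolean validate(long meta) {
            return meta != 0; // placeholder for the quality/year/size/rank checks
        }
    }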
@@ -35,6 +35,13 @@ public class StatefulIndex {
         this.eventLog = eventLog;
     }
 
+    /** For use in testing only */
+    public StatefulIndex(CombinedIndexReader combinedIndexReader) {
+        this.combinedIndexReader = combinedIndexReader;
+        this.servicesFactory = null;
+        this.eventLog = null;
+    }
+
     public void init() {
         Lock lock = indexReplacementLock.writeLock();
 
@@ -1,8 +1,9 @@
 package nu.marginalia.index.model;
 
-import nu.marginalia.index.searchset.SearchSet;
 import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.query.limit.SpecificationLimit;
+import nu.marginalia.index.query.limit.SpecificationLimitType;
+import nu.marginalia.index.searchset.SearchSet;
 
 import java.util.Objects;
 
@@ -41,6 +42,13 @@ public final class QueryParams {
         this.queryStrategy = queryStrategy;
     }
 
+    public boolean imposesDomainMetadataConstraint() {
+        return qualityLimit.type() != SpecificationLimitType.NONE
+                || year.type() != SpecificationLimitType.NONE
+                || size.type() != SpecificationLimitType.NONE
+                || rank.type() != SpecificationLimitType.NONE;
+    }
+
     public SpecificationLimit qualityLimit() {
         return qualityLimit;
     }
code/index/java/nu/marginalia/index/model/ResultRankingContext.java (new file)
@@ -0,0 +1,106 @@
+package nu.marginalia.index.model;
+
+import nu.marginalia.api.searchquery.RpcResultRankingParameters;
+import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
+import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
+import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
+import nu.marginalia.api.searchquery.model.query.SearchQuery;
+import nu.marginalia.index.index.CombinedIndexReader;
+
+import java.util.BitSet;
+
+public class ResultRankingContext {
+    private final int docCount;
+    public final RpcResultRankingParameters params;
+    public final SearchQuery searchQuery;
+    public final QueryParams queryParams;
+
+    public final CompiledQuery<String> compiledQuery;
+    public final CompiledQueryLong compiledQueryIds;
+
+    public final BitSet regularMask;
+    public final BitSet ngramsMask;
+
+    /** CqDataInt associated with frequency information of the terms in the query
+     * in the full index. The dataset is indexed by the compiled query. */
+    public final CqDataInt fullCounts;
+
+    /** CqDataInt associated with frequency information of the terms in the query
+     * in the priority index. The dataset is indexed by the compiled query. */
+    public final CqDataInt priorityCounts;
+
+    public static ResultRankingContext create(CombinedIndexReader currentIndex, SearchParameters searchParameters) {
+
+        var compiledQueryIds = searchParameters.compiledQueryIds;
+        var compiledQuery = searchParameters.compiledQuery;
+
+        int[] full = new int[compiledQueryIds.size()];
+        int[] prio = new int[compiledQueryIds.size()];
+
+        BitSet ngramsMask = new BitSet(compiledQuery.size());
+        BitSet regularMask = new BitSet(compiledQuery.size());
+
+        for (int idx = 0; idx < compiledQueryIds.size(); idx++) {
+            long id = compiledQueryIds.at(idx);
+            full[idx] = currentIndex.numHits(id);
+            prio[idx] = currentIndex.numHitsPrio(id);
+
+            if (compiledQuery.at(idx).contains("_")) {
+                ngramsMask.set(idx);
+            }
+            else {
+                regularMask.set(idx);
+            }
+        }
+
+        return new ResultRankingContext(currentIndex.totalDocCount(),
+                searchParameters,
+                compiledQuery,
+                compiledQueryIds,
+                ngramsMask,
+                regularMask,
+                new CqDataInt(full),
+                new CqDataInt(prio));
+    }
+
+    public ResultRankingContext(int docCount,
+                                SearchParameters searchParameters,
+                                CompiledQuery<String> compiledQuery,
+                                CompiledQueryLong compiledQueryIds,
+                                BitSet ngramsMask,
+                                BitSet regularMask,
+                                CqDataInt fullCounts,
+                                CqDataInt prioCounts)
+    {
+        this.docCount = docCount;
+
+        this.searchQuery = searchParameters.query;
+        this.params = searchParameters.rankingParams;
+        this.queryParams = searchParameters.queryParams;
+
+        this.compiledQuery = compiledQuery;
+        this.compiledQueryIds = compiledQueryIds;
+
+        this.ngramsMask = ngramsMask;
+        this.regularMask = regularMask;
+
+        this.fullCounts = fullCounts;
+        this.priorityCounts = prioCounts;
+    }
+
+    public int termFreqDocCount() {
+        return docCount;
+    }
+
+    @Override
+    public String toString() {
+        return "ResultRankingContext{" +
+                "docCount=" + docCount +
+                ", params=" + params +
+                ", regularMask=" + regularMask +
+                ", ngramsMask=" + ngramsMask +
+                ", fullCounts=" + fullCounts +
+                ", priorityCounts=" + priorityCounts +
+                '}';
+    }
+}
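The create() method partitions the compiled query into two BitSet masks, using the convention that ngram terms contain an underscore. A toy illustration of the same partitioning (the example terms are made up):

    import java.util.BitSet;
    import java.util.List;

    public class MaskDemo {
        public static void main(String[] args) {
            List<String> terms = List.of("marginalia", "search_engine", "index");

            BitSet ngramsMask = new BitSet(terms.size());
            BitSet regularMask = new BitSet(terms.size());

            for (int i = 0; i < terms.size(); i++) {
                // terms with '_' are ngrams; everything else is a plain keyword
                if (terms.get(i).contains("_")) ngramsMask.set(i);
                else regularMask.set(i);
            }

            System.out.println(ngramsMask);  // {1}
            System.out.println(regularMask); // {0, 2}
        }
    }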
@@ -2,7 +2,7 @@ package nu.marginalia.index.results;
|
|||||||
|
|
||||||
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
|
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
|
||||||
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
|
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
import nu.marginalia.index.model.ResultRankingContext;
|
||||||
|
|
||||||
import java.util.BitSet;
|
import java.util.BitSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@@ -12,13 +12,15 @@ import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
|||||||
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
|
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
|
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
|
||||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||||
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
|
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
|
||||||
|
import nu.marginalia.index.ResultPriorityQueue;
|
||||||
|
import nu.marginalia.index.forward.spans.DocumentSpans;
|
||||||
import nu.marginalia.index.index.CombinedIndexReader;
|
import nu.marginalia.index.index.CombinedIndexReader;
|
||||||
import nu.marginalia.index.index.StatefulIndex;
|
import nu.marginalia.index.index.StatefulIndex;
|
||||||
import nu.marginalia.index.model.SearchParameters;
|
import nu.marginalia.index.model.ResultRankingContext;
|
||||||
import nu.marginalia.index.model.SearchTermsUtil;
|
import nu.marginalia.index.model.SearchTermsUtil;
|
||||||
|
import nu.marginalia.index.query.IndexSearchBudget;
|
||||||
import nu.marginalia.index.results.model.PhraseConstraintGroupList;
|
import nu.marginalia.index.results.model.PhraseConstraintGroupList;
|
||||||
import nu.marginalia.index.results.model.QuerySearchTerms;
|
import nu.marginalia.index.results.model.QuerySearchTerms;
|
||||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||||
@@ -32,7 +34,10 @@ import org.slf4j.LoggerFactory;
|
|||||||
|
|
||||||
import java.lang.foreign.Arena;
|
import java.lang.foreign.Arena;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
import java.util.*;
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
public class IndexResultRankingService {
|
public class IndexResultRankingService {
|
||||||
@@ -52,15 +57,16 @@ public class IndexResultRankingService {
|
|||||||
this.domainRankingOverrides = domainRankingOverrides;
|
this.domainRankingOverrides = domainRankingOverrides;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<SearchResultItem> rankResults(SearchParameters params,
|
public List<SearchResultItem> rankResults(
|
||||||
boolean exportDebugData,
|
|
||||||
ResultRankingContext rankingContext,
|
ResultRankingContext rankingContext,
|
||||||
CombinedDocIdList resultIds)
|
IndexSearchBudget budget,
|
||||||
|
CombinedDocIdList resultIds,
|
||||||
|
boolean exportDebugData)
|
||||||
{
|
{
|
||||||
if (resultIds.isEmpty())
|
if (resultIds.isEmpty())
|
||||||
return List.of();
|
return List.of();
|
||||||
|
|
||||||
IndexResultScoreCalculator resultRanker = new IndexResultScoreCalculator(statefulIndex, domainRankingOverrides, rankingContext, params);
|
IndexResultScoreCalculator resultRanker = new IndexResultScoreCalculator(statefulIndex, domainRankingOverrides, rankingContext);
|
||||||
|
|
||||||
List<SearchResultItem> results = new ArrayList<>(resultIds.size());
|
List<SearchResultItem> results = new ArrayList<>(resultIds.size());
|
||||||
|
|
||||||
@@ -68,13 +74,11 @@ public class IndexResultRankingService {
|
|||||||
// this may change during the calculation, but we don't want to switch over mid-calculation
|
// this may change during the calculation, but we don't want to switch over mid-calculation
|
||||||
final CombinedIndexReader currentIndex = statefulIndex.get();
|
final CombinedIndexReader currentIndex = statefulIndex.get();
|
||||||
|
|
||||||
final QuerySearchTerms searchTerms = getSearchTerms(params.compiledQuery, params.query);
|
final QuerySearchTerms searchTerms = getSearchTerms(rankingContext.compiledQuery, rankingContext.searchQuery);
|
||||||
final int termCount = searchTerms.termIdsAll.size();
|
final int termCount = searchTerms.termIdsAll.size();
|
||||||
|
|
||||||
// We use an arena for the position data to avoid gc pressure
|
// We use an arena for the position and spans data to limit gc pressure
|
||||||
// from the gamma coded sequences, which can be large and have a lifetime
|
try (var arena = Arena.ofShared()) {
|
||||||
// that matches the try block here
|
|
||||||
try (var arena = Arena.ofConfined()) {
|
|
||||||
|
|
||||||
TermMetadataList[] termsForDocs = new TermMetadataList[termCount];
|
TermMetadataList[] termsForDocs = new TermMetadataList[termCount];
|
||||||
for (int ti = 0; ti < termCount; ti++) {
|
for (int ti = 0; ti < termCount; ti++) {
|
||||||
@@ -87,11 +91,12 @@ public class IndexResultRankingService {
|
|||||||
|
|
||||||
long[] flags = new long[termCount];
|
long[] flags = new long[termCount];
|
||||||
CodedSequence[] positions = new CodedSequence[termCount];
|
CodedSequence[] positions = new CodedSequence[termCount];
|
||||||
|
DocumentSpans[] documentSpans = currentIndex.getDocumentSpans(arena, resultIds);
|
||||||
|
|
||||||
// Iterate over documents by their index in the combinedDocIds, as we need the index for the
|
// Iterate over documents by their index in the combinedDocIds, as we need the index for the
|
||||||
// term data arrays as well
|
// term data arrays as well
|
||||||
|
|
||||||
for (int i = 0; i < resultIds.size(); i++) {
|
for (int i = 0; i < resultIds.size() && budget.hasTimeLeft(); i++) {
|
||||||
|
|
||||||
// Prepare term-level data for the document
|
// Prepare term-level data for the document
|
||||||
for (int ti = 0; ti < flags.length; ti++) {
|
for (int ti = 0; ti < flags.length; ti++) {
|
||||||
@@ -109,14 +114,15 @@ public class IndexResultRankingService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!exportDebugData) {
|
if (!exportDebugData) {
|
||||||
var score = resultRanker.calculateScore(arena, null, resultIds.at(i), searchTerms, flags, positions);
|
var score = resultRanker.calculateScore(null, resultIds.at(i), searchTerms, flags, positions, documentSpans[i]);
|
||||||
if (score != null) {
|
if (score != null) {
|
||||||
results.add(score);
|
results.add(score);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
             var rankingFactors = new DebugRankingFactors();
-            var score = resultRanker.calculateScore(arena, rankingFactors, resultIds.at(i), searchTerms, flags, positions);
+            var score = resultRanker.calculateScore( rankingFactors, resultIds.at(i), searchTerms, flags, positions, documentSpans[i]);

             if (score != null) {
                 score.debugRankingFactors = rankingFactors;
                 results.add(score);
@@ -129,19 +135,20 @@ public class IndexResultRankingService {
     }


-    public List<RpcDecoratedResultItem> selectBestResults(SearchParameters params,
+    public List<RpcDecoratedResultItem> selectBestResults(int limitByDomain,
+                                                          int limitTotal,
                                                           ResultRankingContext resultRankingContext,
-                                                          Collection<SearchResultItem> results) throws SQLException {
+                                                          ResultPriorityQueue results) throws SQLException {

-        var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain);
+        var domainCountFilter = new IndexResultDomainDeduplicator(limitByDomain);

         List<SearchResultItem> resultsList = new ArrayList<>(results.size());
-        TLongList idsList = new TLongArrayList(params.limitTotal);
+        TLongList idsList = new TLongArrayList(limitTotal);

         for (var item : results) {
             if (domainCountFilter.test(item)) {

-                if (resultsList.size() < params.limitTotal) {
+                if (resultsList.size() < limitTotal) {
                     resultsList.add(item);
                     idsList.add(item.getDocumentId());
                 }
@@ -159,18 +166,18 @@ public class IndexResultRankingService {
        // for the selected results, as this would be comically expensive to do for all the results we
        // discard along the way

-        if (params.rankingParams.getExportDebugData()) {
+        if (resultRankingContext.params.getExportDebugData()) {
            var combinedIdsList = new LongArrayList(resultsList.size());
            for (var item : resultsList) {
                combinedIdsList.add(item.combinedId);
            }

            resultsList.clear();
+           IndexSearchBudget budget = new IndexSearchBudget(10000);
            resultsList.addAll(this.rankResults(
-                   params,
-                   true,
                    resultRankingContext,
-                   new CombinedDocIdList(combinedIdsList))
+                   budget, new CombinedDocIdList(combinedIdsList),
+                   true)
            );
        }

@@ -247,7 +254,7 @@ public class IndexResultRankingService {

        var termOutputs = RpcResultTermRankingOutputs.newBuilder();

-       CqDataLong termIds = params.compiledQueryIds.data;;
+       CqDataLong termIds = resultRankingContext.compiledQueryIds.data;

        for (var entry : debugFactors.getTermFactors()) {
            String term = "[ERROR IN LOOKUP]";
@@ -255,7 +262,7 @@ public class IndexResultRankingService {
            // CURSED: This is a linear search, but the number of terms is small, and it's in a debug path
            for (int i = 0; i < termIds.size(); i++) {
                if (termIds.get(i) == entry.termId()) {
-                   term = params.compiledQuery.at(i);
+                   term = resultRankingContext.compiledQuery.at(i);
                    break;
                }
            }
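The selectBestResults refactor above replaces the SearchParameters grab-bag with explicit limits, reading the remaining query state from ResultRankingContext. A minimal sketch of the new call shape; the variable names and limit values here are assumptions for illustration, not taken from the commit:

    // Hypothetical call site: the limits are now passed explicitly rather
    // than read off SearchParameters inside the method.
    List<RpcDecoratedResultItem> best = rankingService.selectBestResults(
            5,                     // limitByDomain: max results kept per domain
            100,                   // limitTotal: max results kept overall
            resultRankingContext,  // carries the compiled query and ranking parameters
            candidateQueue);       // a ResultPriorityQueue of scored candidates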
@@ -6,14 +6,13 @@ import nu.marginalia.api.searchquery.RpcResultRankingParameters;
 import nu.marginalia.api.searchquery.RpcTemporalBias;
 import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
 import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
-import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
 import nu.marginalia.api.searchquery.model.results.SearchResultItem;
 import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
 import nu.marginalia.index.forward.spans.DocumentSpans;
 import nu.marginalia.index.index.CombinedIndexReader;
 import nu.marginalia.index.index.StatefulIndex;
 import nu.marginalia.index.model.QueryParams;
-import nu.marginalia.index.model.SearchParameters;
+import nu.marginalia.index.model.ResultRankingContext;
 import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.results.model.PhraseConstraintGroupList;
 import nu.marginalia.index.results.model.QuerySearchTerms;
@@ -28,7 +27,6 @@ import nu.marginalia.sequence.CodedSequence;
 import nu.marginalia.sequence.SequenceOperations;

 import javax.annotation.Nullable;
-import java.lang.foreign.Arena;
 import java.util.BitSet;

 import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.booleanAggregate;
@@ -47,24 +45,23 @@ public class IndexResultScoreCalculator {

     public IndexResultScoreCalculator(StatefulIndex statefulIndex,
                                       DomainRankingOverrides domainRankingOverrides,
-                                      ResultRankingContext rankingContext,
-                                      SearchParameters params)
+                                      ResultRankingContext rankingContext)
     {
         this.index = statefulIndex.get();
         this.domainRankingOverrides = domainRankingOverrides;
         this.rankingContext = rankingContext;

-        this.queryParams = params.queryParams;
-        this.compiledQuery = params.compiledQuery;
+        this.queryParams = rankingContext.queryParams;
+        this.compiledQuery = rankingContext.compiledQuery;
     }

     @Nullable
-    public SearchResultItem calculateScore(Arena arena,
-                                           @Nullable DebugRankingFactors debugRankingFactors,
+    public SearchResultItem calculateScore(@Nullable DebugRankingFactors debugRankingFactors,
                                            long combinedId,
                                            QuerySearchTerms searchTerms,
                                            long[] wordFlags,
-                                           CodedSequence[] positions)
+                                           CodedSequence[] positions,
+                                           DocumentSpans spans)
     {

         CompiledQuery<CodedSequence> positionsQuery = compiledQuery.root.newQuery(positions);
@@ -92,8 +89,6 @@ public class IndexResultScoreCalculator {
         int docSize = index.getDocumentSize(docId);
         if (docSize <= 0) docSize = 5000;

-        DocumentSpans spans = index.getDocumentSpans(arena, docId);
-
         if (debugRankingFactors != null) {
             debugRankingFactors.addDocumentFactor("doc.docId", Long.toString(combinedId));
             debugRankingFactors.addDocumentFactor("doc.combinedId", Long.toString(docId));
@@ -235,7 +230,7 @@ public class IndexResultScoreCalculator {
         long result = 0;
         int bit = 0;

-        IntIterator intersection = phraseConstraints.getFullGroup().findIntersections(positions).intIterator();
+        IntIterator intersection = phraseConstraints.getFullGroup().findIntersections(positions, 64).intIterator();

         while (intersection.hasNext() && bit < 64) {
             bit = (int) (Math.sqrt(intersection.nextInt()));
@@ -3,7 +3,7 @@ package nu.marginalia.index.results;
 import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
 import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
 import nu.marginalia.api.searchquery.model.compiled.CqExpression;
-import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
+import nu.marginalia.index.model.ResultRankingContext;
 import nu.marginalia.model.idx.WordFlags;

 import java.util.List;
@@ -58,6 +58,7 @@ public class PhraseConstraintGroupList {
         private final int[] offsets;
         private final BitSet present;
         private final BitSet termIdsMask;
+        private final int presentCardinality;

         public final int size;
         public PhraseConstraintGroup(List<String> terms, TermIdList termIdsAll) {
@@ -85,6 +86,8 @@ public class PhraseConstraintGroupList {
                     termIdsMask.set(idx);
                 }
             }

+            presentCardinality = present.cardinality();
         }

         /** Returns true if the term with index termIdx in the query is in the group */
@@ -93,7 +96,7 @@ public class PhraseConstraintGroupList {
         }

         public boolean test(CodedSequence[] positions) {
-            IntIterator[] sequences = new IntIterator[present.cardinality()];
+            IntIterator[] sequences = new IntIterator[presentCardinality];

             for (int oi = 0, si = 0; oi < offsets.length; oi++) {
                 if (!present.get(oi)) {
@@ -120,7 +123,7 @@ public class PhraseConstraintGroupList {


         public IntList findIntersections(IntList[] positions) {
-            IntList[] sequences = new IntList[present.cardinality()];
+            IntList[] sequences = new IntList[presentCardinality];
             int[] iterOffsets = new int[sequences.length];

             for (int oi = 0, si = 0; oi < offsets.length; oi++) {
@@ -144,12 +147,41 @@ public class PhraseConstraintGroupList {
                 iterOffsets[si - 1] = -oi;
             }

-            return SequenceOperations.findIntersections(sequences, iterOffsets);
+            return SequenceOperations.findIntersections(sequences, iterOffsets, Integer.MAX_VALUE);
+        }
+
+
+        public IntList findIntersections(IntList[] positions, int n) {
+            IntList[] sequences = new IntList[presentCardinality];
+            int[] iterOffsets = new int[sequences.length];
+
+            for (int oi = 0, si = 0; oi < offsets.length; oi++) {
+                if (!present.get(oi)) {
+                    continue;
+                }
+                int offset = offsets[oi];
+                if (offset < 0)
+                    return IntList.of();
+
+                // Create iterators that are offset by their relative position in the
+                // sequence. This is done by subtracting the index from the offset,
+                // so that when we intersect them, an overlap means that the terms are
+                // in the correct order. Note the offset is negative!
+
+                var posForTerm = positions[offset];
+                if (posForTerm == null) {
+                    return IntList.of();
+                }
+                sequences[si++] = posForTerm;
+                iterOffsets[si - 1] = -oi;
+            }
+
+            return SequenceOperations.findIntersections(sequences, iterOffsets, n);
         }

         public int minDistance(IntList[] positions) {
-            List<IntList> sequences = new ArrayList<>(present.cardinality());
-            IntList iterOffsets = new IntArrayList(present.cardinality());
+            List<IntList> sequences = new ArrayList<>(presentCardinality);
+            IntList iterOffsets = new IntArrayList(presentCardinality);

             for (int oi = 0; oi < offsets.length; oi++) {
                 if (!present.get(oi)) {
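The comment block in the new findIntersections(IntList[] positions, int n) overload describes the alignment trick used for phrase matching: each term's position list is shifted back by the term's index in the phrase, so a phrase occurrence shows up as the same value in every shifted list. A self-contained sketch of the idea, with made-up positions and a naive intersection standing in for SequenceOperations.findIntersections:

    import it.unimi.dsi.fastutil.ints.IntArrayList;
    import it.unimi.dsi.fastutil.ints.IntList;

    public class PhraseAlignmentSketch {
        public static void main(String[] args) {
            // Phrase "foo bar": "foo" occurs at positions 5 and 40, "bar" at 6 and 90.
            IntList fooPositions = IntList.of(5, 40);
            IntList barPositions = IntList.of(6, 90);

            // Shift "bar" by its phrase index (offset -1): {6, 90} becomes {5, 89}.
            IntList barShifted = new IntArrayList();
            for (int i = 0; i < barPositions.size(); i++) {
                barShifted.add(barPositions.getInt(i) - 1);
            }

            // Naive intersection: equal values mark positions where the terms
            // appear adjacent and in the right order.
            IntList phraseStarts = new IntArrayList();
            for (int i = 0; i < fooPositions.size(); i++) {
                if (barShifted.contains(fooPositions.getInt(i))) {
                    phraseStarts.add(fooPositions.getInt(i));
                }
            }

            System.out.println(phraseStarts); // [5] -- the phrase starts at position 5
        }
    }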
@@ -1,6 +1,7 @@
 package nu.marginalia.index.results.model.ids;

 import it.unimi.dsi.fastutil.longs.LongArrayList;
+import nu.marginalia.array.page.LongQueryBuffer;
 import org.roaringbitmap.longlong.Roaring64Bitmap;

 import java.util.Arrays;
@@ -17,7 +18,9 @@ public final class CombinedDocIdList {
     public CombinedDocIdList(long... data) {
         this.data = Arrays.copyOf(data, data.length);
     }
+    public CombinedDocIdList(LongQueryBuffer buffer) {
+        this.data = buffer.copyData();
+    }
     public CombinedDocIdList(LongArrayList data) {
         this.data = data.toLongArray();
     }
@@ -59,7 +59,7 @@ public class RankingSearchSet implements SearchSet {
     }

     @Override
-    public boolean contains(int domainId, long documentMetadata) {
+    public boolean contains(int domainId) {

         // This is the main check
         if (set.contains(domainId) || set.isEmpty()) {
@@ -7,6 +7,6 @@ public interface SearchSet {
      * or if the documentMetadata vibes with the set
      *
      */
-    boolean contains(int domainId, long documentMetadata);
+    boolean contains(int domainId);

 }
@@ -2,7 +2,7 @@ package nu.marginalia.index.searchset;

 public class SearchSetAny implements SearchSet {
     @Override
-    public boolean contains(int domainId, long meta) {
+    public boolean contains(int domainId) {
         return true;
     }

@@ -14,7 +14,7 @@ public class SmallSearchSet implements SearchSet {
     }

     @Override
-    public boolean contains(int domainId, long meta) {
+    public boolean contains(int domainId) {
         return entries.contains(domainId);
     }

@@ -10,5 +10,5 @@ public class IndexSearchBudget {
     }

     public boolean hasTimeLeft() { return System.currentTimeMillis() < timeout; }
-    public long timeLeft() { return timeout - System.currentTimeMillis(); }
+    public long timeLeft() { return Math.max(0, timeout - System.currentTimeMillis()); }
 }
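The timeLeft() clamp above matters because an exhausted budget would otherwise report a negative duration, and timed APIs disagree on what a negative timeout means (Object.wait throws IllegalArgumentException, for instance). A small illustrative sketch; the queue and the sleep are assumptions for demonstration, not from the commit:

    import java.util.concurrent.ArrayBlockingQueue;
    import java.util.concurrent.TimeUnit;

    public class BudgetClampSketch {
        public static void main(String[] args) throws InterruptedException {
            ArrayBlockingQueue<Integer> queue = new ArrayBlockingQueue<>(1);
            queue.put(1); // queue is full, so a timed offer must wait

            long timeout = System.currentTimeMillis() + 50; // 50 ms budget
            Thread.sleep(100);                              // budget has expired

            // Without the clamp this would be negative.
            long timeLeft = Math.max(0, timeout - System.currentTimeMillis());

            boolean accepted = queue.offer(2, timeLeft, TimeUnit.MILLISECONDS);
            System.out.println(accepted); // false: times out immediately
        }
    }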
@@ -25,10 +25,10 @@ class RankingSearchSetTest {
         set.write();

         RankingSearchSet set2 = new RankingSearchSet("ACADEMIA", p);
-        assertTrue(set2.contains(1, 0));
-        assertTrue(set2.contains(5, 0));
-        assertTrue(set2.contains(7, 0));
-        assertTrue(set2.contains(9, 0));
+        assertTrue(set2.contains(1));
+        assertTrue(set2.contains(5));
+        assertTrue(set2.contains(7));
+        assertTrue(set2.contains(9));

         Files.delete(p);

@@ -56,7 +56,7 @@ public class SequenceOperations {
      * <p></p>
      */
     public static IntList findIntersections(IntList... positions) {
-        return findIntersections(positions, new int[positions.length]);
+        return findIntersections(positions, new int[positions.length], Integer.MAX_VALUE);
     }

     /** Find any intersections between the given positions lists, and return the list of intersections.
@@ -67,53 +67,80 @@ public class SequenceOperations {
      * @param positions the positions lists to compare - each list must be sorted in ascending order
      *                  and contain unique values.
      * @param offsets constant offsets to apply to each position
+     * @param n maximum number of intersections we're interested in. The algorithm does not guarantee
+     *          the return value will have a smaller size than this if it is cheaper to return back e.g.
+     *          an input list.
      * */
-    public static IntList findIntersections(IntList[] positions, int[] offsets) {
+    public static IntList findIntersections(IntList[] positions, int[] offsets, int n) {

-        if (positions.length < 1)
+        // Trivial cases
+        if (positions.length < 1) { // n = 0
             return IntList.of();
+        }
+//        else if (positions.length == 1) { // n = 1
+//            if (offsets[0] == 0) { // with zero offset, we'll just return the input back
+//                return positions[0];
+//            }
+//
+//            // Calculate an offset input array
+//            IntList ret = new IntArrayList(positions[0].size());
+//            for (int i = 0; i < positions[0].size() && i < n; i++) {
+//                ret.add(positions[0].getInt(i) + offsets[0]);
+//            }
+//            return ret;
+//        }

         int[] indexes = new int[positions.length];
         // Initialize values and find the maximum value
         int[] values = new int[positions.length];
+        int minLength = Integer.MAX_VALUE;
+        int largestValue = Integer.MAX_VALUE;

         for (int i = 0; i < positions.length; i++) {
+            minLength = Math.min(minLength, positions[i].size());
+
             if (indexes[i] < positions[i].size())
                 values[i] = positions[i].getInt(indexes[i]++) + offsets[i];
             else
                 return IntList.of();
+
+            largestValue = Math.min(largestValue, positions[i].getInt(positions[i].size() - 1) + offsets[i]);
         }

         // Intersect the sequences by advancing all values smaller than the maximum seen so far
         // until they are equal to the maximum value, or until the end of the sequence is reached
-        int max = Integer.MIN_VALUE;
-        int successes = 0;
+        int currentMax = Integer.MIN_VALUE;

-        IntList ret = new IntArrayList();
+        int listMatches = 0;
+        int foundIntersections = 0;
+
+        IntList ret = new IntArrayList(Math.min(n, Math.max(1, minLength)));

         outer:
-        for (int i = 0;; i = (i + 1) % positions.length)
+        for (int i = 0; currentMax <= largestValue; i = (i + 1) % positions.length)
         {
-            if (successes == positions.length) {
-                ret.add(max);
-                successes = 1;
+            if (listMatches == positions.length) {
+                ret.add(currentMax);
+                if (++foundIntersections > n) return ret;
+
+                listMatches = 1;

                 if (indexes[i] < positions[i].size()) {
                     values[i] = positions[i].getInt(indexes[i]++) + offsets[i];

                     // Update the maximum value, if necessary
-                    max = Math.max(max, values[i]);
+                    currentMax = Math.max(currentMax, values[i]);
                 } else {
                     break;
                 }
-            } else if (values[i] == max) {
-                successes++;
+            } else if (values[i] == currentMax) {
+                listMatches++;
             } else {
-                successes = 1;
+                listMatches = 1;

                 // Discard values until we reach the maximum value seen so far,
                 // or until the end of the sequence is reached
-                while (values[i] < max) {
+                while (values[i] < currentMax) {
                     if (indexes[i] < positions[i].size()) {
                         values[i] = positions[i].getInt(indexes[i]++) + offsets[i];
                     } else {
@@ -122,14 +149,13 @@ public class SequenceOperations {
                 }

                 // Update the maximum value, if necessary
-                max = Math.max(max, values[i]);
+                currentMax = Math.max(currentMax, values[i]);
             }
         }

         return ret;
     }

-
     /** Given each set of positions, one from each list, find the set with the smallest distance between them
      * and return that distance. If any of the lists are empty, return 0.
      * */
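A usage sketch of the capped three-argument form documented above; the position data is made up, and note the Javadoc's caveat that n is a soft cap, so the result can slightly exceed it:

    import it.unimi.dsi.fastutil.ints.IntList;
    import nu.marginalia.sequence.SequenceOperations;

    public class FindIntersectionsSketch {
        public static void main(String[] args) {
            // Positions of two terms in a document (illustrative values).
            IntList foo = IntList.of(5, 40, 72);
            IntList bar = IntList.of(6, 41, 90);

            // Offsetting "bar" by -1 aligns adjacent occurrences; n = 1 lets the
            // scan bail out early once an intersection has been found.
            IntList phraseStarts = SequenceOperations.findIntersections(
                    new IntList[] { foo, bar },
                    new int[] { 0, -1 },
                    1);

            System.out.println(phraseStarts); // starts with 5; may hold one match past the cap
        }
    }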
@@ -146,10 +172,14 @@ public class SequenceOperations {
     public static int minDistance(IntList[] positions, int[] offsets) {
         if (positions.length <= 1)
             return 0;
+        if (positions.length == 1)
+            return 0;

         int[] values = new int[positions.length];
         int[] indexes = new int[positions.length];

+        int largestValue = 0;
+
         for (int i = 0; i < positions.length; i++) {
             // if any of the lists are empty, return MAX_VALUE

@@ -158,6 +188,7 @@ public class SequenceOperations {
             }

             values[i] = positions[i].getInt(indexes[i]++) + offsets[i];
+            largestValue = Math.min(largestValue, positions[i].getInt(positions[i].size() - 1) + offsets[i]);
         }

         int minDist = Integer.MAX_VALUE;
@@ -173,7 +204,7 @@ public class SequenceOperations {
             }
         }

-        for (;;) {
+        do {
             // For all the other indexes except maxI, update values[] with the largest value smaller than maxVal
             for (int idx = 0; idx < positions.length - 1; idx++) {
                 int i = (maxI + idx) % positions.length;
@@ -228,6 +259,8 @@ public class SequenceOperations {
             else {
                 return minDist;
             }
-        }
+        } while (maxVal <= largestValue);
+
+        return minDist;
     }
 }
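For reference, a worked example of what minDistance computes: picking one position from each list, it reports the smallest spread over all such picks. Values here are assumed for illustration:

    import it.unimi.dsi.fastutil.ints.IntList;
    import nu.marginalia.sequence.SequenceOperations;

    public class MinDistanceSketch {
        public static void main(String[] args) {
            // Pairings: |1-3| = 2, |5-3| = 2, |5-10| = 5, |1-10| = 9,
            // so the smallest distance between one pick from each list is 2.
            int d = SequenceOperations.minDistance(
                    new IntList[] { IntList.of(1, 5), IntList.of(3, 10) },
                    new int[] { 0, 0 });

            System.out.println(d); // expected: 2
        }
    }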
@@ -2,31 +2,38 @@ package nu.marginalia.bench;

 import it.unimi.dsi.fastutil.ints.IntArrayList;
 import it.unimi.dsi.fastutil.ints.IntList;
-import nu.marginalia.sequence.GammaCodedSequence;
-import nu.marginalia.sequence.VarintCodedSequence;
+import nu.marginalia.sequence.SequenceOperations;
 import org.openjdk.jmh.annotations.*;

-import java.nio.ByteBuffer;
+import java.util.Random;

 public class SequenceBenchmarks {

     @State(Scope.Benchmark)
     public static class SequenceState {
-        VarintCodedSequence vcs;
-        GammaCodedSequence gcs;
-        IntList list;
-        ByteBuffer workArea;
-        int[] arrayValues;
-        int[] valueBuffer;
-        public SequenceState()
-        {
-            valueBuffer = new int[128];
-
-            workArea = ByteBuffer.allocate(65536);
-            arrayValues = new int[] { 1, 3, 5, 16, 1024, 2048, 4096, 4098, 4100 };
-            list = new IntArrayList(arrayValues);
-            vcs = VarintCodedSequence.generate(16,21,24,28,66,71,76,83,87,98,101,106,113,115,119,122,143,148,159,164,167,177,182,211,223,242,245,250,273,275,280,289,292,300,307,322,330,338,345,371,397,402,411,420,427,430,433,437,440,448,451,481,490,513,522,555,571,573,585,597,606,613,634,638,640,644,656,660,666,683,689,692,696,709,712,718,727,731,735,738);
-            gcs = GammaCodedSequence.generate(workArea, 16,21,24,28,66,71,76,83,87,98,101,106,113,115,119,122,143,148,159,164,167,177,182,211,223,242,245,250,273,275,280,289,292,300,307,322,330,338,345,371,397,402,411,420,427,430,433,437,440,448,451,481,490,513,522,555,571,573,585,597,606,613,634,638,640,644,656,660,666,683,689,692,696,709,712,718,727,731,735,738);
+        IntList a;
+        IntList b;
+        IntList c;
+
+        public SequenceState() {
+            a = new IntArrayList();
+            b = new IntArrayList();
+            c = new IntArrayList();
+
+            var r = new Random(1000);
+
+            for (int i = 0; i < 10; i++) {
+                b.add(r.nextInt(0, 5000));
+            }
+
+            for (int i = 0; i < 100; i++) {
+                c.add(r.nextInt(0, 5000));
+            }
+
+            for (int i = 0; i < 1000; i++) {
+                a.add(r.nextInt(0, 5000));
+            }
         }
     }

@@ -34,57 +41,17 @@ public class SequenceBenchmarks {
     @Warmup(iterations = 1)
     @Benchmark
     @BenchmarkMode(Mode.Throughput)
-    public int vcsDecode(SequenceState state) {
-        var iter = state.vcs.iterator();
-        int sum = 0;
-        while (iter.hasNext()) {
-            sum += iter.nextInt();
-        }
-        return sum;
-    }
-//
-//    @Fork(value = 5, warmups = 5)
-//    @Warmup(iterations = 5)
-//    @Benchmark
-//    @BenchmarkMode(Mode.Throughput)
-//    public int listDecode2(SequenceState state) {
-//        var list = state.arrayValues;
-//        int sum = 0;
-//        for (int i = 0; i < list.length; i++) {
-//            sum += list[i];
-//        }
-//        return sum;
-//    }
+    public IntList intersect(SequenceState state) {
+        return SequenceOperations.findIntersections(state.a, state.b, state.c);
+    }

     @Fork(value = 1, warmups = 1)
     @Warmup(iterations = 1)
     @Benchmark
     @BenchmarkMode(Mode.Throughput)
-    public int gcsDecode(SequenceState state) {
-        var iter = state.gcs.iterator();
-        int sum = 0;
-        while (iter.hasNext()) {
-            sum += iter.nextInt();
-        }
-        return sum;
-    }
-
-//    @Fork(value = 1, warmups = 1)
-//    @Warmup(iterations = 1)
-//    @Benchmark
-//    @BenchmarkMode(Mode.Throughput)
-//    public VarintCodedSequence vcsEncode(SequenceState state) {
-//        return VarintCodedSequence.generate(1, 3, 5, 16, 1024, 2048, 4096, 4098, 4100);
-//    }
-
-//    @Fork(value = 1, warmups = 1)
-//    @Warmup(iterations = 1)
-//    @Benchmark
-//    @BenchmarkMode(Mode.Throughput)
-//    public GammaCodedSequence gcsEncode(SequenceState state) {
-//        return GammaCodedSequence.generate(state.workArea, 1, 3, 5, 16, 1024, 2048, 4096, 4098, 4100);
-//    }
-
+    public IntList intersect1(SequenceState state) {
+        return SequenceOperations.findIntersections(state.a);
+    }

 }
@@ -11,7 +11,6 @@ import nu.marginalia.sequence.VarintCodedSequence;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.nio.ByteBuffer;
 import java.util.*;

 public class DocumentKeywordsBuilder {
@@ -36,7 +35,7 @@ public class DocumentKeywordsBuilder {
         this(1600);
     }

-    public DocumentKeywords build(ByteBuffer workArea) {
+    public DocumentKeywords build() {
         final List<String> wordArray = new ArrayList<>(wordToMeta.size());
         final TByteArrayList meta = new TByteArrayList(wordToMeta.size());
         final List<VarintCodedSequence> positions = new ArrayList<>(wordToMeta.size());
@@ -13,7 +13,6 @@ import org.junit.jupiter.api.Test;

 import java.io.IOException;
 import java.net.URISyntaxException;
-import java.nio.ByteBuffer;
 import java.nio.charset.Charset;
 import java.util.HashMap;
 import java.util.Map;
@@ -56,7 +55,7 @@ class DocumentKeywordExtractorTest {
                 new LinkTexts(), new EdgeUrl("https://encyclopedia.marginalia.nu/article/Don't_Tell_Me_(Madonna_song)")
         );

-        var keywordsBuilt = keywords.build(ByteBuffer.allocate(1024));
+        var keywordsBuilt = keywords.build();

         Map<String, Byte> flags = new HashMap<>();
         Map<String, CodedSequence> positions = new HashMap<>();
@@ -16,7 +16,6 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.IOException;
-import java.nio.ByteBuffer;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.*;
@@ -94,8 +93,6 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter

         String domainName = domain.toString();

-        ByteBuffer workArea = ByteBuffer.allocate(16384);
-
         while (documentIterator.hasNext()) {
             var document = documentIterator.next();

@@ -103,7 +100,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
                 continue;
             }

-            var wb = document.words.build(workArea);
+            var wb = document.words.build();

             List<VarintCodedSequence> spanSequences = new ArrayList<>(wb.spans.size());
             byte[] spanCodes = new byte[wb.spans.size()];
@@ -200,14 +200,16 @@ public class SearchOperator {
             }
         }
         else if (topdomain.equals("medium.com")) {
+            int slashIndex = path.indexOf("/", 1);
+            if (slashIndex >= 0) {
                 if (!subdomain.isBlank()) {
                     return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, path, null);
                 }
                 else {
-                    String article = path.substring(path.indexOf("/", 1));
+                    String article = path.substring(slashIndex);
                     return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, article, null);
                 }
+            }
         }
         return url;
     }
@@ -231,13 +231,16 @@ public class SearchOperator {
             }
         }
         else if (topdomain.equals("medium.com")) {
+            int slashIndex = path.indexOf("/", 1);
+            if (slashIndex >= 0) {
                 if (!subdomain.isBlank()) {
                     return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, path, null);
                 }
                 else {
-                    String article = path.substring(path.indexOf("/", 1));
+                    String article = path.substring(slashIndex);
                     return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, article, null);
                 }
+            }
+
         }
         return url;
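Both SearchOperator hunks guard against indexOf returning -1: String.substring(-1) throws StringIndexOutOfBoundsException, so a medium.com path without a second slash previously crashed the scribe.rip rewrite. A minimal illustration with an assumed path:

    public class SlashIndexSketch {
        public static void main(String[] args) {
            String path = "/some-article";          // no second "/" in the path
            int slashIndex = path.indexOf("/", 1);  // -1: not found

            // Before the guard: path.substring(-1) -> StringIndexOutOfBoundsException.
            // With it, the rewrite is skipped and the original URL is kept.
            if (slashIndex >= 0) {
                System.out.println(path.substring(slashIndex));
            } else {
                System.out.println("no second slash; keep original URL");
            }
        }
    }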
@@ -82,7 +82,6 @@ public class SingleService {
     enum Service {
         IndexService("index", "nu.marginalia.index.IndexMain"),
         ControlService("control", "nu.marginalia.control.ControlMain"),
-        ExecutorService("executor", "nu.marginalia.executor.ExecutorMain"),
         QueryService("query", "nu.marginalia.query.QueryMain"),
         ;

@@ -14,7 +14,7 @@ import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.functions.searchquery.QueryFactory;
-import nu.marginalia.index.IndexGrpcService;
+import nu.marginalia.index.IndexQueryExecution;
 import nu.marginalia.index.ReverseIndexFullFileNames;
 import nu.marginalia.index.ReverseIndexPrioFileNames;
 import nu.marginalia.index.construction.full.FullIndexConstructor;
@@ -25,6 +25,7 @@ import nu.marginalia.index.forward.construction.ForwardIndexConverter;
 import nu.marginalia.index.index.StatefulIndex;
 import nu.marginalia.index.journal.IndexJournal;
 import nu.marginalia.index.model.SearchParameters;
+import nu.marginalia.index.results.IndexResultRankingService;
 import nu.marginalia.index.searchset.SearchSetAny;
 import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.linkdb.docs.DocumentDbReader;
@@ -89,10 +90,11 @@ public class IntegrationTest {
     @Inject
     StatefulIndex statefulIndex;
     @Inject
-    IndexGrpcService indexGrpcService;
-    @Inject
     DocumentDbReader documentDbReader;

+    @Inject
+    IndexResultRankingService rankingService;
+
     @Inject
     QueryFactory queryFactory;

@@ -222,7 +224,7 @@ public class IntegrationTest {

         System.out.println(indexRequest);

-        var rs = indexGrpcService.executeSearch(new SearchParameters(indexRequest, new SearchSetAny()));
+        var rs = new IndexQueryExecution(new SearchParameters(indexRequest, new SearchSetAny()), rankingService, statefulIndex.get());

         System.out.println(rs);
     }
@@ -14,3 +14,5 @@
 2025-07-21: Deploy executor partition 1.
 2025-07-21: Deploy search.
 2025-07-23: Redeploy all.
+2025-07-25: Redeploy index.
+
@@ -40,6 +40,7 @@ include 'code:index:index-journal'
 include 'code:index:query'
 include 'code:index:index-forward'
 include 'code:index:index-reverse'
+include 'code:index:index-perftest'

 include 'code:libraries:array'
 include 'code:libraries:array:cpp'