Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-10-05 21:22:39 +02:00.

Comparing commits: deploy-030...edd453531e (195 commits).
@@ -6,6 +6,7 @@ plugins {
// This is a workaround for a bug in the Jib plugin that causes it to stall randomly
// https://github.com/GoogleContainerTools/jib/issues/3347
id 'com.google.cloud.tools.jib' version '3.4.5' apply(false)
id 'com.adarshr.test-logger' version '4.0.0'
}

group 'marginalia'

@@ -31,7 +32,10 @@ subprojects.forEach {it ->
jvmArgs += ['--enable-preview']
}
it.tasks.withType(Test).configureEach {
jvmArgs += ['--enable-preview']
jvmArgs += ['--enable-preview',
'--enable-native-access=ALL-UNNAMED',
'--sun-misc-unsafe-memory-access=allow',
'-Dsystem.uringQueueCount=1']
}

// Enable reproducible builds for the entire project

@@ -6,7 +6,6 @@ import com.google.inject.name.Named;
import gnu.trove.list.TLongList;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.UrlIdCodec;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@@ -14,7 +13,6 @@ import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;

import java.nio.file.StandardCopyOption;
import java.sql.Connection;
import java.sql.DriverManager;

@@ -104,7 +102,7 @@ public class DocumentDbReader {
}

try (var stmt = connection.prepareStatement("""
SELECT ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR
SELECT ID, URL, TITLE, DESCRIPTION, LANGUAGE, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR
FROM DOCUMENT WHERE ID = ?
""")) {
for (int i = 0; i < ids.size(); i++) {

@@ -118,6 +116,7 @@ public class DocumentDbReader {
url,
rs.getString("TITLE"),
rs.getString("DESCRIPTION"),
rs.getString("LANGUAGE"),
rs.getDouble("QUALITY"),
rs.getString("FORMAT"),
rs.getInt("FEATURES"),

@@ -41,8 +41,8 @@ public class DocumentDbWriter {
public void add(List<DocdbUrlDetail> docdbUrlDetail) throws SQLException {

try (var stmt = connection.prepareStatement("""
INSERT OR IGNORE INTO DOCUMENT(ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
INSERT OR IGNORE INTO DOCUMENT(ID, URL, TITLE, DESCRIPTION, LANGUAGE, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""")) {

int i = 0;

@@ -54,15 +54,16 @@ public class DocumentDbWriter {

stmt.setString(3, document.title());
stmt.setString(4, document.description());
stmt.setInt(5, document.wordsTotal());
stmt.setString(6, document.format());
stmt.setInt(7, document.features());
stmt.setLong(8, document.dataHash());
stmt.setDouble(9, document.urlQuality());
stmt.setString(5, document.language());
stmt.setInt(6, document.wordsTotal());
stmt.setString(7, document.format());
stmt.setInt(8, document.features());
stmt.setLong(9, document.dataHash());
stmt.setDouble(10, document.urlQuality());
if (document.pubYear() == null) {
stmt.setInt(10, 0);
stmt.setInt(11, 0);
} else {
stmt.setInt(10, document.pubYear());
stmt.setInt(11, document.pubYear());
}

stmt.addBatch();

@@ -6,6 +6,7 @@ public record DocdbUrlDetail(long urlId,
EdgeUrl url,
String title,
String description,
String language,
double urlQuality,
String format,
int features,

@@ -6,6 +6,7 @@ CREATE TABLE DOCUMENT (
STATE INT,
TITLE TEXT NOT NULL,
DESCRIPTION TEXT NOT NULL,
LANGUAGE TEXT NOT NULL,

WORDS_TOTAL INTEGER NOT NULL,
FORMAT TEXT NOT NULL,

@@ -23,6 +23,7 @@ public class DocumentDbWriterTest {
new nu.marginalia.model.EdgeUrl("http", new EdgeDomain("example.com"), null, "/", null),
"Test",
"This is a test",
"en",
-4.,
"XHTML",
5,

@@ -22,7 +22,6 @@ dependencies {
implementation project(':code:processes:ping-process')
implementation project(':code:processes:new-domain-process')
implementation project(':code:processes:converting-process')
implementation project(':code:processes:index-constructor-process')

implementation project(':code:common:config')
implementation project(':code:common:model')

@@ -34,7 +33,7 @@ dependencies {
implementation project(':third-party:commons-codec')

implementation project(':code:libraries:message-queue')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:libraries:language-processing')

implementation project(':code:functions:link-graph:api')
implementation project(':code:functions:live-capture:api')

@@ -5,7 +5,6 @@ import com.google.inject.Singleton;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.ConverterMain;
import nu.marginalia.crawl.CrawlerMain;
import nu.marginalia.index.IndexConstructorMain;
import nu.marginalia.livecrawler.LiveCrawlerMain;
import nu.marginalia.loading.LoaderMain;
import nu.marginalia.ndp.NdpMain;

@@ -57,7 +56,7 @@ public class ProcessSpawnerService {
LIVE_CRAWLER(LiveCrawlerMain.class),
CONVERTER(ConverterMain.class),
LOADER(LoaderMain.class),
INDEX_CONSTRUCTOR(IndexConstructorMain.class),
INDEX_CONSTRUCTOR("nu.marginalia.index.IndexConstructorMain"),
NDP(NdpMain.class),
EXPORT_TASKS(ExportTasksMain.class),
;

@@ -66,6 +65,9 @@ public class ProcessSpawnerService {
ProcessId(Class<? extends ProcessMainClass> mainClass) {
this.mainClass = mainClass.getName();
}
ProcessId(String mainClassFullName) {
this.mainClass = mainClassFullName;
}

List<String> envOpts() {
String variable = switch (this) {

@@ -22,7 +22,6 @@ dependencies {
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:common:service')
implementation project(':code:index:query')
implementation project(':code:libraries:language-processing')

implementation libs.bundles.slf4j

@@ -2,8 +2,8 @@ package nu.marginalia.api.searchquery;

import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.api.searchquery.model.query.SpecificationLimit;
import nu.marginalia.api.searchquery.model.query.SpecificationLimitType;

import java.util.ArrayList;
import java.util.List;

@@ -9,7 +9,7 @@ import nu.marginalia.api.searchquery.model.results.debug.DebugFactor;
import nu.marginalia.api.searchquery.model.results.debug.DebugFactorGroup;
import nu.marginalia.api.searchquery.model.results.debug.DebugTermFactorGroup;
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.api.searchquery.model.query.QueryStrategy;
import nu.marginalia.model.EdgeUrl;

import java.util.ArrayList;

@@ -28,6 +28,7 @@ public class QueryProtobufCodec {

builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
builder.setHumanQuery(request.getHumanQuery());
builder.setLangIsoCode(query.langIsoCode);

builder.setNsfwFilterTierValue(request.getNsfwFilterTierValue());

@@ -76,6 +77,7 @@ public class QueryProtobufCodec {

builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
builder.setHumanQuery(humanQuery);
builder.setLangIsoCode(query.langIsoCode);

builder.setNsfwFilterTier(RpcIndexQuery.NSFW_FILTER_TIER.DANGER);

@@ -114,6 +116,7 @@ public class QueryProtobufCodec {
QueryStrategy.valueOf(request.getQueryStrategy()),
RpcTemporalBias.Bias.valueOf(request.getTemporalBias().getBias().name()),
NsfwFilterTier.fromCodedValue(request.getNsfwFilterTierValue()),
request.getLangIsoCode(),
request.getPagination().getPage()
);
}

@@ -335,7 +338,8 @@ public class QueryProtobufCodec {
.setPagination(RpcQsQueryPagination.newBuilder()
.setPage(params.page())
.setPageSize(Math.min(100, params.limits().getResultsTotal()))
.build());
.build())
.setLangIsoCode(params.langIsoCode());

if (params.nearDomain() != null)
builder.setNearDomain(params.nearDomain());

@@ -1,19 +1,24 @@
package nu.marginalia.api.searchquery.model.query;

import java.util.*;
import java.util.List;

public class ProcessedQuery {
public final SearchSpecification specs;
public final List<String> searchTermsHuman;
public final String domain;
public final String langIsoCode;

public ProcessedQuery(SearchSpecification specs, List<String> searchTermsHuman, String domain) {
public ProcessedQuery(SearchSpecification specs,
List<String> searchTermsHuman,
String domain,
String langIsoCode) {
this.specs = specs;
this.searchTermsHuman = searchTermsHuman;
this.domain = domain;
this.langIsoCode = langIsoCode;
}

public ProcessedQuery(SearchSpecification justSpecs) {
this(justSpecs, List.of(), null);
this(justSpecs, List.of(), null, "en");
}
}

@@ -2,8 +2,6 @@ package nu.marginalia.api.searchquery.model.query;

import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.RpcTemporalBias;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;

import javax.annotation.Nullable;
import java.util.List;

@@ -26,10 +24,11 @@ public record QueryParams(
QueryStrategy queryStrategy,
RpcTemporalBias.Bias temporalBias,
NsfwFilterTier filterTier,
String langIsoCode,
int page
)
{
public QueryParams(String query, RpcQueryLimits limits, String identifier, NsfwFilterTier filterTier) {
public QueryParams(String query, RpcQueryLimits limits, String identifier, NsfwFilterTier filterTier, String langIsoCode) {
this(query, null,
List.of(),
List.of(),

@@ -45,6 +44,7 @@ public record QueryParams(
QueryStrategy.AUTO,
RpcTemporalBias.Bias.NONE,
filterTier,
langIsoCode,
1 // page
);
}

@@ -1,4 +1,4 @@
package nu.marginalia.index.query.limit;
package nu.marginalia.api.searchquery.model.query;

public enum QueryStrategy {
SENTENCE,

@@ -2,8 +2,6 @@ package nu.marginalia.api.searchquery.model.query;

import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;

import javax.annotation.Nullable;
import java.util.List;

@@ -1,4 +1,4 @@
package nu.marginalia.index.query.limit;
package nu.marginalia.api.searchquery.model.query;

public record SpecificationLimit(SpecificationLimitType type, int value) {
public boolean isNone() {

@@ -1,4 +1,4 @@
package nu.marginalia.index.query.limit;
package nu.marginalia.api.searchquery.model.query;

public enum SpecificationLimitType {
NONE,

@@ -34,6 +34,7 @@ message RpcQsQuery {
RpcQsQueryPagination pagination = 17;

NSFW_FILTER_TIER nsfwFilterTier = 18;
string langIsoCode = 19;

enum NSFW_FILTER_TIER {
NONE = 0;

@@ -88,6 +89,7 @@ message RpcIndexQuery {
RpcResultRankingParameters parameters = 12;

NSFW_FILTER_TIER nsfwFilterTier = 13;
string langIsoCode = 14;

enum NSFW_FILTER_TIER {
NONE = 0;

@@ -3,7 +3,7 @@ package nu.marginalia.index.client;
import nu.marginalia.api.searchquery.IndexProtobufCodec;
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.api.searchquery.model.query.SpecificationLimit;
import org.junit.jupiter.api.Test;

import java.util.List;

@@ -22,18 +22,13 @@ dependencies {
implementation project(':code:functions:nsfw-domain-filter')
implementation project(':code:functions:search-query:api')

implementation project(':code:index:query')

implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')

implementation project(':third-party:porterstemmer')
implementation project(':third-party:openzim')
implementation project(':third-party:commons-codec')

implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:processes:converting-process:ft-keyword-extraction')

implementation libs.bundles.slf4j

@@ -8,8 +8,8 @@ import nu.marginalia.api.searchquery.model.query.*;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
import nu.marginalia.functions.searchquery.query_parser.QueryParser;
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.api.searchquery.model.query.QueryStrategy;
import nu.marginalia.api.searchquery.model.query.SpecificationLimit;
import nu.marginalia.language.WordPatterns;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;

@@ -167,7 +167,7 @@ public class QueryFactory {
specs.query.searchTermsPriority.addAll(params.tacitPriority());
specs.query.searchTermsExclude.addAll(params.tacitExcludes());

return new ProcessedQuery(specs, searchTermsHuman, domain);
return new ProcessedQuery(specs, searchTermsHuman, domain, params.langIsoCode());
}

private void analyzeSearchTerm(List<String> problems, String str, String displayStr) {

@@ -1,7 +1,7 @@
package nu.marginalia.functions.searchquery.query_parser;

import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.api.searchquery.model.query.SpecificationLimit;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.util.transform_list.TransformList;

@@ -1,7 +1,7 @@
package nu.marginalia.functions.searchquery.query_parser.token;

import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.api.searchquery.model.query.SpecificationLimit;

public sealed interface QueryToken {
String str();

@@ -3,14 +3,9 @@ package nu.marginalia.query.svc;
import nu.marginalia.WmsaHome;
import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.RpcTemporalBias;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.*;
import nu.marginalia.functions.searchquery.QueryFactory;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.junit.jupiter.api.Assertions;

@@ -60,6 +55,7 @@ public class QueryFactoryTest {
QueryStrategy.AUTO,
RpcTemporalBias.Bias.NONE,
NsfwFilterTier.OFF,
"en",
0), null).specs;
}

@@ -216,6 +212,12 @@ public class QueryFactoryTest {
}

@Test
public void testExpansion10() {
var subquery = parseAndGetSpecs("when was captain james cook born");
System.out.println(subquery);
}

@Test
public void testContractionWordNum() {
var subquery = parseAndGetSpecs("glove 80");

@@ -22,8 +22,13 @@ dependencies {

implementation project(':code:libraries:array')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:skiplist')
implementation project(':code:libraries:native')
implementation project(':code:libraries:random-write-funnel')
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:message-queue')

implementation project(':code:common:db')
implementation project(':code:common:config')

@@ -32,11 +37,9 @@ dependencies {
implementation project(':code:common:service')

implementation project(':code:processes:converting-process:model')
implementation project(':code:processes:process-mq-api')

implementation project(':code:functions:search-query:api')
implementation project(':code:index:index-forward')
implementation project(':code:index:index-reverse')
implementation project(':code:index:query')
implementation project(':code:index:index-journal')

@@ -74,7 +77,7 @@ dependencies {
testImplementation 'org.testcontainers:mariadb:1.17.4'
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
testImplementation project(':code:libraries:test-helpers')
testImplementation project(':code:libraries:term-frequency-dict')
testImplementation project(':code:libraries:language-processing')
testImplementation project(':code:libraries:braille-block-punch-cards')
testImplementation project(':code:libraries:test-helpers')
}

@@ -1,38 +0,0 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation project(':code:libraries:array')
|
||||
implementation project(':code:libraries:btree')
|
||||
implementation project(':code:libraries:coded-sequence')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
implementation project(':code:index:query')
|
||||
implementation project(':code:index:index-journal')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:processes:converting-process:model')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
implementation libs.prometheus
|
||||
implementation libs.roaringbitmap
|
||||
implementation libs.fastutil
|
||||
implementation libs.trove
|
||||
implementation libs.slop
|
||||
|
||||
testImplementation project(':code:libraries:test-helpers')
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
}
|
@@ -1,33 +0,0 @@
|
||||
package nu.marginalia.index.forward;
|
||||
|
||||
import java.nio.file.Path;
|
||||
|
||||
public class ForwardIndexFileNames {
|
||||
public static Path resolve(Path basePath, FileIdentifier identifier, FileVersion version) {
|
||||
return switch (identifier) {
|
||||
case DOC_ID -> switch (version) {
|
||||
case NEXT -> basePath.resolve("fwd-doc-id.dat.next");
|
||||
case CURRENT -> basePath.resolve("fwd-doc-id.dat");
|
||||
};
|
||||
case DOC_DATA -> switch (version) {
|
||||
case NEXT -> basePath.resolve("fwd-doc-data.dat.next");
|
||||
case CURRENT -> basePath.resolve("fwd-doc-data.dat");
|
||||
};
|
||||
case SPANS_DATA -> switch (version) {
|
||||
case NEXT -> basePath.resolve("fwd-spans.dat.next");
|
||||
case CURRENT -> basePath.resolve("fwd-spans.dat");
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
public enum FileVersion {
|
||||
CURRENT,
|
||||
NEXT
|
||||
}
|
||||
|
||||
public enum FileIdentifier {
|
||||
DOC_DATA,
|
||||
SPANS_DATA,
|
||||
DOC_ID
|
||||
}
|
||||
}
|
@@ -1,70 +0,0 @@
|
||||
package nu.marginalia.index.forward.spans;
|
||||
|
||||
import nu.marginalia.sequence.VarintCodedSequence;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.foreign.Arena;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
@Deprecated
|
||||
public class IndexSpansReaderCompressed implements AutoCloseable, IndexSpansReader {
|
||||
private final FileChannel spansFileChannel;
|
||||
|
||||
public IndexSpansReaderCompressed(Path spansFile) throws IOException {
|
||||
this.spansFileChannel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ);
|
||||
}
|
||||
|
||||
public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException {
|
||||
// Decode the size and offset from the encoded offset
|
||||
long size = SpansCodec.decodeSize(encodedOffset);
|
||||
long offset = SpansCodec.decodeStartOffset(encodedOffset);
|
||||
|
||||
// Allocate a buffer from the arena
|
||||
var buffer = arena.allocate(size).asByteBuffer();
|
||||
buffer.clear();
|
||||
while (buffer.hasRemaining()) {
|
||||
spansFileChannel.read(buffer, offset + buffer.position());
|
||||
}
|
||||
buffer.flip();
|
||||
|
||||
// Read the number of spans in the document
|
||||
int count = buffer.get();
|
||||
|
||||
DocumentSpans ret = new DocumentSpans();
|
||||
|
||||
// Decode each span
|
||||
while (count-- > 0) {
|
||||
byte code = buffer.get();
|
||||
short len = buffer.getShort();
|
||||
|
||||
ByteBuffer data = buffer.slice(buffer.position(), len);
|
||||
ret.accept(code, new VarintCodedSequence(data));
|
||||
|
||||
// Reset the buffer position to the end of the span
|
||||
buffer.position(buffer.position() + len);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocumentSpans[] readSpans(Arena arena, long[] encodedOffsets) throws IOException {
|
||||
DocumentSpans[] ret = new DocumentSpans[encodedOffsets.length];
|
||||
for (int i = 0; i < encodedOffsets.length; i++) {
|
||||
if (encodedOffsets[i] >= 0) {
|
||||
ret[i] = readSpans(arena, encodedOffsets[i]);
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
spansFileChannel.close();
|
||||
}
|
||||
|
||||
}
|
@@ -1,122 +0,0 @@
|
||||
package nu.marginalia.index.forward.spans;
|
||||
|
||||
import it.unimi.dsi.fastutil.ints.IntArrayList;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.foreign.Arena;
|
||||
import java.lang.foreign.MemorySegment;
|
||||
import java.lang.foreign.ValueLayout;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.concurrent.CountDownLatch;
|
||||
import java.util.concurrent.ForkJoinPool;
|
||||
|
||||
public class IndexSpansReaderPlain implements IndexSpansReader {
|
||||
private final FileChannel[] spansFileChannels;
|
||||
private final ForkJoinPool forkJoinPool;
|
||||
|
||||
public IndexSpansReaderPlain(Path spansFile) throws IOException {
|
||||
this.spansFileChannels = new FileChannel[8];
|
||||
for (int i = 0; i < spansFileChannels.length; i++) {
|
||||
spansFileChannels[i] = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ);
|
||||
}
|
||||
forkJoinPool = new ForkJoinPool(spansFileChannels.length);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException {
|
||||
// Decode the size and offset from the encoded offset
|
||||
long size = SpansCodec.decodeSize(encodedOffset);
|
||||
long offset = SpansCodec.decodeStartOffset(encodedOffset);
|
||||
|
||||
var ms = arena.allocate(size, 4);
|
||||
// Allocate a buffer from the arena
|
||||
var buffer = ms.asByteBuffer();
|
||||
while (buffer.hasRemaining()) {
|
||||
spansFileChannels[0].read(buffer, offset + buffer.position());
|
||||
}
|
||||
|
||||
return decode(ms);
|
||||
}
|
||||
|
||||
public DocumentSpans decode(MemorySegment ms) {
|
||||
int count = ms.get(ValueLayout.JAVA_INT, 0);
|
||||
int pos = 4;
|
||||
DocumentSpans ret = new DocumentSpans();
|
||||
|
||||
// Decode each span
|
||||
for (int spanIdx = 0; spanIdx < count; spanIdx++) {
|
||||
byte code = ms.get(ValueLayout.JAVA_BYTE, pos);
|
||||
short len = ms.get(ValueLayout.JAVA_SHORT, pos+2);
|
||||
|
||||
IntArrayList values = new IntArrayList(len);
|
||||
|
||||
pos += 4;
|
||||
for (int i = 0; i < len; i++) {
|
||||
values.add(ms.get(ValueLayout.JAVA_INT, pos + 4*i));
|
||||
}
|
||||
ret.accept(code, values);
|
||||
pos += 4*len;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
@Override
|
||||
public DocumentSpans[] readSpans(Arena arena, long[] encodedOffsets) throws IOException {
|
||||
long totalSize = 0;
|
||||
int numJobs = 0;
|
||||
for (long offset : encodedOffsets) {
|
||||
if (offset < 0)
|
||||
continue;
|
||||
totalSize += SpansCodec.decodeSize(offset);
|
||||
numJobs++;
|
||||
}
|
||||
|
||||
DocumentSpans[] ret = new DocumentSpans[encodedOffsets.length];
|
||||
if (numJobs == 0) return ret;
|
||||
|
||||
CountDownLatch latch = new CountDownLatch(numJobs);
|
||||
MemorySegment segment = arena.allocate(totalSize, 8);
|
||||
|
||||
long bufferOffset = 0;
|
||||
for (int idx = 0; idx < encodedOffsets.length; idx++) {
|
||||
long size = SpansCodec.decodeSize(encodedOffsets[idx]);
|
||||
long start = SpansCodec.decodeStartOffset(encodedOffsets[idx]);
|
||||
|
||||
MemorySegment slice = segment.asSlice(bufferOffset, size);
|
||||
bufferOffset += size;
|
||||
|
||||
int i = idx;
|
||||
forkJoinPool.execute(() -> {
|
||||
var buffer = slice.asByteBuffer();
|
||||
try {
|
||||
spansFileChannels[i% spansFileChannels.length].read(buffer, start);
|
||||
ret[i] = decode(slice);
|
||||
}
|
||||
catch (IOException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
finally {
|
||||
latch.countDown();
|
||||
}
|
||||
});
|
||||
}
|
||||
try {
|
||||
latch.await();
|
||||
}
|
||||
catch (InterruptedException ex) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
for (var spansFileChannel : spansFileChannels) {
|
||||
spansFileChannel.close();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@@ -1,21 +0,0 @@
# Forward Index

The forward index contains a mapping from document id to various forms of document metadata.

In practice, the forward index consists of two files, an `id` file and a `data` file.

The `id` file contains a list of sorted document ids, and the `data` file contains
metadata for each document id, in the same order as the `id` file, with a fixed
size record containing data associated with each document id.

Each record contains a binary encoded [DocumentMetadata](../../common/model/java/nu/marginalia/model/idx/DocumentMetadata.java) object,
as well as a [HtmlFeatures](../../common/model/java/nu/marginalia/model/crawl/HtmlFeature.java) bitmask.

Unlike the reverse index, the forward index is not split into two tiers, and the data is in the same
order as it is in the source data, and the cardinality of the document IDs is assumed to fit in memory,
so it's relatively easy to construct.

## Central Classes

* [ForwardIndexConverter](java/nu/marginalia/index/forward/construction/ForwardIndexConverter.java) constructs the index.
* [ForwardIndexReader](java/nu/marginalia/index/forward/ForwardIndexReader.java) interrogates the index.

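The removed README above describes the lookup model well enough to sketch it: binary-search the sorted `id` file for a document id, then read the fixed-size record at the same ordinal from the `data` file. The sketch below is only illustrative and not the actual ForwardIndexReader API; the 12-byte record layout (one long of encoded DocumentMetadata plus an int HtmlFeatures bitmask) and the pre-loaded id array are assumptions made for the example.

```java
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.Arrays;

/** Illustrative sketch of a forward-index lookup as described above; the
 *  12-byte record layout (long metadata + int feature bitmask) is an assumption. */
class ForwardLookupSketch {
    private static final int RECORD_BYTES = Long.BYTES + Integer.BYTES;

    // Binary-search the (already loaded) sorted id list, then read the record
    // at the matching ordinal from the data file.
    static ByteBuffer findRecord(long[] sortedDocIds, FileChannel dataFile, long docId) throws IOException {
        int ordinal = Arrays.binarySearch(sortedDocIds, docId);
        if (ordinal < 0)
            return null; // document is not in the index

        ByteBuffer record = ByteBuffer.allocate(RECORD_BYTES);
        dataFile.read(record, (long) ordinal * RECORD_BYTES);
        return record.flip();
    }

    public static void main(String[] args) throws IOException {
        long[] ids = { 101L, 205L, 999L }; // stand-in for the contents of the id file
        try (FileChannel data = FileChannel.open(Path.of("fwd-doc-data.dat"), StandardOpenOption.READ)) {
            ByteBuffer rec = findRecord(ids, data, 205L);
            if (rec != null) {
                long documentMetadata = rec.getLong(); // binary encoded DocumentMetadata
                int htmlFeatures = rec.getInt();       // HtmlFeature bitmask
                System.out.println(documentMetadata + " " + htmlFeatures);
            }
        }
    }
}
```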
@@ -14,6 +14,7 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation project(':code:libraries:coded-sequence')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
implementation project(':code:libraries:array')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:processes:converting-process:model')
|
||||
|
@@ -2,11 +2,10 @@ package nu.marginalia.index.journal;
|
||||
|
||||
import nu.marginalia.slop.SlopTable;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.*;
|
||||
|
||||
public record IndexJournal(Path journalDir) {
|
||||
|
||||
@@ -47,4 +46,21 @@ public record IndexJournal(Path journalDir) {
|
||||
|
||||
return instances;
|
||||
}
|
||||
|
||||
public Set<String> languages() {
|
||||
try {
|
||||
Set<String> languages = new HashSet<>(languages());
|
||||
|
||||
for (var instance : pages()) {
|
||||
try (var slopTable = new SlopTable(instance.baseDir(), instance.page())) {
|
||||
languages.addAll(instance.openLanguageIsoCode(slopTable).getDictionary());
|
||||
}
|
||||
}
|
||||
|
||||
return languages;
|
||||
}
|
||||
catch (IOException ex) {
|
||||
throw new RuntimeException("Failed to read langauges from index journal");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -6,17 +6,22 @@ import nu.marginalia.slop.column.array.ByteArrayColumn;
|
||||
import nu.marginalia.slop.column.array.LongArrayColumn;
|
||||
import nu.marginalia.slop.column.primitive.IntColumn;
|
||||
import nu.marginalia.slop.column.primitive.LongColumn;
|
||||
import nu.marginalia.slop.column.string.EnumColumn;
|
||||
import nu.marginalia.slop.desc.StorageType;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Path;
|
||||
|
||||
public record IndexJournalPage(Path baseDir, int page) {
|
||||
public static IntColumn features = new IntColumn("features", StorageType.PLAIN);
|
||||
public static IntColumn size = new IntColumn("size", StorageType.PLAIN);
|
||||
|
||||
public static LongColumn combinedId = new LongColumn("combinedId", StorageType.PLAIN);
|
||||
public static LongColumn documentMeta = new LongColumn("documentMeta", StorageType.PLAIN);
|
||||
|
||||
public static EnumColumn languageIsoCode = new EnumColumn("languageIsoCode", StandardCharsets.US_ASCII, StorageType.PLAIN);
|
||||
|
||||
public static LongArrayColumn termIds = new LongArrayColumn("termIds", StorageType.ZSTD);
|
||||
public static ByteArrayColumn termMeta = new ByteArrayColumn("termMetadata", StorageType.ZSTD);
|
||||
public static VarintCodedSequenceArrayColumn positions = new VarintCodedSequenceArrayColumn("termPositions", StorageType.ZSTD);
|
||||
@@ -24,6 +29,7 @@ public record IndexJournalPage(Path baseDir, int page) {
|
||||
public static ByteArrayColumn spanCodes = new ByteArrayColumn("spanCodes", StorageType.ZSTD);
|
||||
public static VarintCodedSequenceArrayColumn spans = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD);
|
||||
|
||||
|
||||
public IndexJournalPage {
|
||||
if (!baseDir.toFile().isDirectory()) {
|
||||
throw new IllegalArgumentException("Invalid base directory: " + baseDir);
|
||||
@@ -46,6 +52,9 @@ public record IndexJournalPage(Path baseDir, int page) {
|
||||
return size.open(table);
|
||||
}
|
||||
|
||||
public EnumColumn.Reader openLanguageIsoCode(SlopTable table) throws IOException {
|
||||
return languageIsoCode.open(table);
|
||||
}
|
||||
|
||||
public LongArrayColumn.Reader openTermIds(SlopTable table) throws IOException {
|
||||
return termIds.open(table);
|
||||
|
@@ -1,6 +1,6 @@
|
||||
package nu.marginalia.index.journal;
|
||||
|
||||
import nu.marginalia.hash.MurmurHash3_128;
|
||||
import nu.marginalia.language.keywords.KeywordHasher;
|
||||
import nu.marginalia.model.processed.SlopDocumentRecord;
|
||||
import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn;
|
||||
import nu.marginalia.slop.SlopTable;
|
||||
@@ -8,6 +8,7 @@ import nu.marginalia.slop.column.array.ByteArrayColumn;
|
||||
import nu.marginalia.slop.column.array.LongArrayColumn;
|
||||
import nu.marginalia.slop.column.primitive.IntColumn;
|
||||
import nu.marginalia.slop.column.primitive.LongColumn;
|
||||
import nu.marginalia.slop.column.string.EnumColumn;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
@@ -27,8 +28,7 @@ public class IndexJournalSlopWriter extends SlopTable {
|
||||
|
||||
private final VarintCodedSequenceArrayColumn.Writer spansWriter;
|
||||
private final ByteArrayColumn.Writer spanCodesWriter;
|
||||
|
||||
private static final MurmurHash3_128 hash = new MurmurHash3_128();
|
||||
private final EnumColumn.Writer languagesWriter;
|
||||
|
||||
public IndexJournalSlopWriter(Path dir, int page) throws IOException {
|
||||
|
||||
@@ -50,14 +50,17 @@ public class IndexJournalSlopWriter extends SlopTable {
|
||||
|
||||
spanCodesWriter = IndexJournalPage.spanCodes.create(this);
|
||||
spansWriter = IndexJournalPage.spans.create(this);
|
||||
|
||||
languagesWriter = IndexJournalPage.languageIsoCode.create(this);
|
||||
}
|
||||
|
||||
public void put(long combinedId, SlopDocumentRecord.KeywordsProjection keywordsProjection) throws IOException {
|
||||
public void put(long combinedId, SlopDocumentRecord.KeywordsProjection keywordsProjection, KeywordHasher hasher) throws IOException {
|
||||
|
||||
combinedIdWriter.put(combinedId);
|
||||
featuresWriter.put(keywordsProjection.htmlFeatures());
|
||||
sizeWriter.put(keywordsProjection.length());
|
||||
documentMetaWriter.put(keywordsProjection.documentMetadata());
|
||||
languagesWriter.put(keywordsProjection.languageIsoCode());
|
||||
|
||||
// -- write keyword data --
|
||||
|
||||
@@ -66,7 +69,7 @@ public class IndexJournalSlopWriter extends SlopTable {
|
||||
// termIds are the special hashes of the keywords
|
||||
long[] termIds = new long[keywordsProjection.words().size()];
|
||||
for (int i = 0; i < termIds.length; i++) {
|
||||
termIds[i] = hash.hashKeyword(keywords.get(i));
|
||||
termIds[i] = hasher.hashKeyword(keywords.get(i));
|
||||
}
|
||||
|
||||
termIdsWriter.put(termIds);
|
||||
@@ -87,6 +90,7 @@ public class IndexJournalSlopWriter extends SlopTable {
|
||||
termIdsWriter.close();
|
||||
termMetadataWriter.close();
|
||||
termPositionsWriter.close();
|
||||
languagesWriter.close();
|
||||
spansWriter.close();
|
||||
spanCodesWriter.close();
|
||||
}
|
||||
|
@@ -21,13 +21,11 @@ dependencies {
|
||||
implementation project(':code:common:db')
|
||||
|
||||
implementation project(':code:libraries:array')
|
||||
implementation project(':code:libraries:native')
|
||||
implementation project(':code:libraries:btree')
|
||||
implementation project(':code:libraries:term-frequency-dict')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
implementation project(':code:common:linkdb')
|
||||
implementation project(':code:index')
|
||||
implementation project(':code:index:query')
|
||||
implementation project(':code:index:index-forward')
|
||||
implementation project(':code:index:index-reverse')
|
||||
implementation project(':third-party:commons-codec')
|
||||
implementation project(':code:functions:search-query')
|
||||
implementation project(':code:functions:search-query:api')
|
||||
|
@@ -0,0 +1,262 @@
|
||||
package nu.marginalia.index.perftest;
|
||||
|
||||
import nu.marginalia.ffi.LinuxSystemCalls;
|
||||
import nu.marginalia.uring.UringFileReader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.foreign.Arena;
|
||||
import java.lang.foreign.MemorySegment;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
import java.util.stream.LongStream;
|
||||
|
||||
public class IoPatternsMain {
|
||||
|
||||
static void testBuffered(int sz, int small, int large, int iters) {
|
||||
try {
|
||||
Path largeFile = Path.of("/home/vlofgren/largefile.dat");
|
||||
long fileSize = Files.size(largeFile);
|
||||
|
||||
Random r = new Random();
|
||||
List<MemorySegment> segments = new ArrayList<>();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
if (small == large) {
|
||||
segments.add(Arena.ofAuto().allocate(small));
|
||||
}
|
||||
else {
|
||||
segments.add(Arena.ofAuto().allocate(r.nextInt(small, large)));
|
||||
}
|
||||
}
|
||||
List<Long> offsets = new ArrayList<>();
|
||||
|
||||
long[] samples = new long[1000];
|
||||
int si = 0;
|
||||
|
||||
try (UringFileReader reader = new UringFileReader(largeFile, false)) {
|
||||
for (int iter = 0; iter < iters; ) {
|
||||
if (si == samples.length) {
|
||||
Arrays.sort(samples);
|
||||
double p1 = samples[10] / 1_000.;
|
||||
double p10 = samples[100] / 1_000.;
|
||||
double p90 = samples[900] / 1_000.;
|
||||
double p99 = samples[990] / 1_000.;
|
||||
double avg = LongStream.of(samples).average().getAsDouble() / 1000.;
|
||||
System.out.println("B"+"\t"+avg+"\t"+p1 + " " + p10 + " " + p90 + " " + p99);
|
||||
si = 0;
|
||||
iter++;
|
||||
}
|
||||
|
||||
offsets.clear();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
offsets.add(r.nextLong(0, fileSize - 256));
|
||||
}
|
||||
|
||||
long st = System.nanoTime();
|
||||
reader.read(segments, offsets);
|
||||
long et = System.nanoTime();
|
||||
|
||||
samples[si++] = et - st;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
static void testBufferedPread(int sz, int iters) {
|
||||
try {
|
||||
Path largeFile = Path.of("/home/vlofgren/largefile.dat");
|
||||
long fileSize = Files.size(largeFile);
|
||||
|
||||
Random r = new Random();
|
||||
List<MemorySegment> segments = new ArrayList<>();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
segments.add(Arena.ofAuto().allocate(r.nextInt(24, 256)));
|
||||
}
|
||||
List<Long> offsets = new ArrayList<>();
|
||||
|
||||
long[] samples = new long[1000];
|
||||
int si = 0;
|
||||
|
||||
int fd = -1;
|
||||
try {
|
||||
fd = LinuxSystemCalls.openBuffered(largeFile);
|
||||
LinuxSystemCalls.fadviseRandom(fd);
|
||||
|
||||
for (int iter = 0; iter < iters; ) {
|
||||
if (si == samples.length) {
|
||||
Arrays.sort(samples);
|
||||
double p1 = samples[10] / 1_000.;
|
||||
double p10 = samples[100] / 1_000.;
|
||||
double p90 = samples[900] / 1_000.;
|
||||
double p99 = samples[990] / 1_000.;
|
||||
double avg = LongStream.of(samples).average().getAsDouble() / 1000.;
|
||||
System.out.println("BP"+"\t"+avg+"\t"+p1 + " " + p10 + " " + p90 + " " + p99);
|
||||
si = 0;
|
||||
iter++;
|
||||
}
|
||||
|
||||
offsets.clear();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
offsets.add(r.nextLong(0, fileSize - 256));
|
||||
}
|
||||
|
||||
long st = System.nanoTime();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
LinuxSystemCalls.readAt(fd, segments.get(i), offsets.get(i));
|
||||
}
|
||||
long et = System.nanoTime();
|
||||
|
||||
samples[si++] = et - st;
|
||||
}
|
||||
}
|
||||
finally {
|
||||
LinuxSystemCalls.closeFd(fd);
|
||||
}
|
||||
}
|
||||
catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void testDirect(int blockSize, int sz, int iters) {
|
||||
try {
|
||||
Path largeFile = Path.of("/home/vlofgren/largefile.dat");
|
||||
int fileSizeBlocks = (int) ((Files.size(largeFile) & -blockSize) / blockSize);
|
||||
|
||||
Random r = new Random();
|
||||
List<MemorySegment> segments = new ArrayList<>();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
segments.add(Arena.ofAuto().allocate(blockSize, blockSize));
|
||||
}
|
||||
List<Long> offsets = new ArrayList<>();
|
||||
|
||||
long[] samples = new long[1000];
|
||||
int si = 0;
|
||||
|
||||
try (UringFileReader reader = new UringFileReader(largeFile, true)) {
|
||||
for (int iter = 0; iter < iters; ) {
|
||||
if (si == samples.length) {
|
||||
Arrays.sort(samples);
|
||||
double p1 = samples[10] / 1_000.;
|
||||
double p10 = samples[100] / 1_000.;
|
||||
double p90 = samples[900] / 1_000.;
|
||||
double p99 = samples[990] / 1_000.;
|
||||
double avg = LongStream.of(samples).average().getAsDouble() / 1000.;
|
||||
System.out.println("DN"+blockSize+"\t"+avg+"\t"+p1 + " " + p10 + " " + p90 + " " + p99);
|
||||
si = 0;
|
||||
iters++;
|
||||
}
|
||||
|
||||
offsets.clear();
|
||||
for (int i = 0; i < sz; i++) {
|
||||
offsets.add(blockSize * r.nextLong(0, fileSizeBlocks));
|
||||
}
|
||||
|
||||
long st = System.nanoTime();
|
||||
reader.read(segments, offsets);
|
||||
long et = System.nanoTime();
|
||||
|
||||
samples[si++] = et - st;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void testDirect1(int blockSize, int iters) {
|
||||
try {
|
||||
Path largeFile = Path.of("/home/vlofgren/largefile.dat");
|
||||
int fileSizeBlocks = (int) ((Files.size(largeFile) & -blockSize) / blockSize);
|
||||
|
||||
Random r = new Random();
|
||||
MemorySegment segment = Arena.global().allocate(blockSize, blockSize);
|
||||
|
||||
long[] samples = new long[1000];
|
||||
int si = 0;
|
||||
|
||||
int fd = LinuxSystemCalls.openDirect(largeFile);
|
||||
if (fd < 0) {
|
||||
throw new IOException("open failed");
|
||||
}
|
||||
try {
|
||||
for (int iter = 0; iter < iters; ) {
|
||||
if (si == samples.length) {
|
||||
Arrays.sort(samples);
|
||||
double p1 = samples[10] / 1_000.;
|
||||
double p10 = samples[100] / 1_000.;
|
||||
double p90 = samples[900] / 1_000.;
|
||||
double p99 = samples[990] / 1_000.;
|
||||
double avg = LongStream.of(samples).average().getAsDouble() / 1000.;
|
||||
System.out.println("D1"+blockSize+"\t"+avg+"\t"+p1 + " " + p10 + " " + p90 + " " + p99);
|
||||
si = 0;
|
||||
iters++;
|
||||
}
|
||||
|
||||
|
||||
long st = System.nanoTime();
|
||||
int ret;
|
||||
long readOffset = blockSize * r.nextLong(0, fileSizeBlocks);
|
||||
if (blockSize != (ret = LinuxSystemCalls.readAt(fd, segment, readOffset))) {
|
||||
throw new IOException("pread failed: " + ret);
|
||||
}
|
||||
long et = System.nanoTime();
|
||||
|
||||
samples[si++] = et - st;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
finally {
|
||||
LinuxSystemCalls.closeFd(fd);
|
||||
}
|
||||
}
|
||||
catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
// Thread.ofPlatform().start(() -> testBuffered(128, 32, 65536,1000));
|
||||
Thread.ofPlatform().start(() -> testDirect(8192*4, 128,1000));
|
||||
// Thread.ofPlatform().start(() -> testBuffered(128, 1000));
|
||||
// Thread.ofPlatform().start(() -> testBuffered(128, 1000));
|
||||
// Thread.ofPlatform().start(() -> testBuffered(128, 1000));
|
||||
// Thread.ofPlatform().start(() -> testBufferedPread(128, 1000));
|
||||
|
||||
// Thread.ofPlatform().start(() -> testDirect1(1024, 1000));
|
||||
// Thread.ofPlatform().start(() -> testDirect1(1024, 1000));
|
||||
// Thread.ofPlatform().start(() -> testDirect1(1024, 1000));
|
||||
// Thread.ofPlatform().start(() -> testDirect1(1024*1024, 1000));
|
||||
// Thread.ofPlatform().start(() -> testDirect1(1024*1024, 1000));
|
||||
// Thread.ofPlatform().start(() -> testDirect(512, 512,1000));
|
||||
// Thread.ofPlatform().start(() -> testDirect(512, 512,1000));
|
||||
// Thread.ofPlatform().start(() -> testDirect(512, 512,1000));
|
||||
// Thread.ofPlatform().start(() -> testDirect(512, 100));
|
||||
// Thread.ofPlatform().start(() -> testDirect(512, 100));
|
||||
// Thread.ofPlatform().start(() -> testDirect(512, 100));
|
||||
// Thread.ofPlatform().start(() -> testDirect(512, 100));
|
||||
// Thread.ofPlatform().start(() -> testBuffered(512, 1000));
|
||||
// Thread.ofPlatform().start(() -> testBuffered(512, 1000));
|
||||
// Thread.ofPlatform().start(() -> testBuffered(512, 1000));
|
||||
// Thread.ofPlatform().start(() -> testBuffered(512, 1000));
|
||||
// Thread.ofPlatform().start(() -> testBuffered(100));
|
||||
// Thread.ofPlatform().start(() -> testBuffered(100));
|
||||
|
||||
for (;;);
|
||||
// testBuffered(100);
|
||||
}
|
||||
}
|
@@ -9,21 +9,20 @@ import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
|
||||
import nu.marginalia.array.page.LongQueryBuffer;
|
||||
import nu.marginalia.functions.searchquery.QueryFactory;
|
||||
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
|
||||
import nu.marginalia.index.FullReverseIndexReader;
|
||||
import nu.marginalia.index.CombinedIndexReader;
|
||||
import nu.marginalia.index.IndexQueryExecution;
|
||||
import nu.marginalia.index.PrioReverseIndexReader;
|
||||
import nu.marginalia.index.StatefulIndex;
|
||||
import nu.marginalia.index.forward.ForwardIndexReader;
|
||||
import nu.marginalia.index.index.CombinedIndexReader;
|
||||
import nu.marginalia.index.index.StatefulIndex;
|
||||
import nu.marginalia.index.model.ResultRankingContext;
|
||||
import nu.marginalia.index.model.SearchParameters;
|
||||
import nu.marginalia.index.model.SearchTerms;
|
||||
import nu.marginalia.index.positions.PositionsFileReader;
|
||||
import nu.marginalia.index.query.IndexQuery;
|
||||
import nu.marginalia.index.model.CombinedDocIdList;
|
||||
import nu.marginalia.index.model.SearchContext;
|
||||
import nu.marginalia.index.results.DomainRankingOverrides;
|
||||
import nu.marginalia.index.results.IndexResultRankingService;
|
||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||
import nu.marginalia.index.reverse.FullReverseIndexReader;
|
||||
import nu.marginalia.index.reverse.PrioReverseIndexReader;
|
||||
import nu.marginalia.index.reverse.WordLexicon;
|
||||
import nu.marginalia.index.reverse.query.IndexQuery;
|
||||
import nu.marginalia.index.searchset.SearchSetAny;
|
||||
import nu.marginalia.language.keywords.KeywordHasher;
|
||||
import nu.marginalia.linkdb.docs.DocumentDbReader;
|
||||
import nu.marginalia.segmentation.NgramLexicon;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
@@ -38,6 +37,7 @@ import java.time.Instant;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
|
||||
public class PerfTestMain {
|
||||
static Duration warmupTime = Duration.ofMinutes(1);
|
||||
@@ -64,6 +64,8 @@ public class PerfTestMain {
|
||||
case "lookup" -> runLookup(indexDir, homeDir, query);
|
||||
case "execution" -> runExecution(indexDir, homeDir, query);
|
||||
}
|
||||
|
||||
System.exit(0);
|
||||
}
|
||||
catch (NumberFormatException e) {
|
||||
System.err.println("Arguments: data-dir index-dir query");
|
||||
@@ -87,13 +89,13 @@ public class PerfTestMain {
|
||||
),
|
||||
new FullReverseIndexReader(
|
||||
"full",
|
||||
indexDir.resolve("ir/rev-words.dat"),
|
||||
List.of(new WordLexicon("en", indexDir.resolve("ir/rev-words-en.dat"))),
|
||||
indexDir.resolve("ir/rev-docs.dat"),
|
||||
new PositionsFileReader(indexDir.resolve("ir/rev-positions.dat"))
|
||||
indexDir.resolve("ir/rev-positions.dat")
|
||||
),
|
||||
new PrioReverseIndexReader(
|
||||
"prio",
|
||||
indexDir.resolve("ir/rev-prio-words.dat"),
|
||||
List.of(new WordLexicon("en", indexDir.resolve("ir/rev-words-prio-en.dat"))),
|
||||
indexDir.resolve("ir/rev-prio-docs.dat")
|
||||
)
|
||||
);
|
||||
@@ -118,8 +120,7 @@ public class PerfTestMain {
|
||||
|
||||
public static void runValuation(Path homeDir,
|
||||
Path indexDir,
|
||||
String rawQuery) throws IOException, SQLException
|
||||
{
|
||||
String rawQuery) throws IOException, SQLException, TimeoutException {
|
||||
|
||||
CombinedIndexReader indexReader = createCombinedIndexReader(indexDir);
|
||||
QueryFactory queryFactory = createQueryFactory(homeDir);
|
||||
@@ -131,54 +132,42 @@ public class PerfTestMain {
|
||||
.setResultsByDomain(10)
|
||||
.setFetchSize(4096)
|
||||
.build();
|
||||
SearchSpecification parsedQuery = queryFactory.createQuery(new QueryParams(rawQuery, queryLimits, "NONE", NsfwFilterTier.OFF), PrototypeRankingParameters.sensibleDefaults()).specs;
|
||||
SearchSpecification parsedQuery = queryFactory.createQuery(new QueryParams(rawQuery, queryLimits, "NONE", NsfwFilterTier.OFF, "en"), PrototypeRankingParameters.sensibleDefaults()).specs;
|
||||
|
||||
System.out.println("Query compiled to: " + parsedQuery.query.compiledQuery);
|
||||
|
||||
SearchParameters searchParameters = new SearchParameters(parsedQuery, new SearchSetAny());
|
||||
|
||||
List<IndexQuery> queries = indexReader.createQueries(new SearchTerms(searchParameters.query, searchParameters.compiledQueryIds), searchParameters.queryParams);
|
||||
var rankingContext = SearchContext.create(indexReader, new KeywordHasher.AsciiIsh(), parsedQuery, new SearchSetAny());
|
||||
List<IndexQuery> queries = indexReader.createQueries(rankingContext);
|
||||
|
||||
TLongArrayList allResults = new TLongArrayList();
|
||||
LongQueryBuffer buffer = new LongQueryBuffer(4096);
|
||||
LongQueryBuffer buffer = new LongQueryBuffer(512);
|
||||
|
||||
for (var query : queries) {
|
||||
while (query.hasMore() && allResults.size() < 4096 ) {
|
||||
while (query.hasMore() && allResults.size() < 512 ) {
|
||||
query.getMoreResults(buffer);
|
||||
allResults.addAll(buffer.copyData());
|
||||
}
|
||||
if (allResults.size() >= 4096)
|
||||
if (allResults.size() >= 512)
|
||||
break;
|
||||
}
|
||||
allResults.sort();
|
||||
if (allResults.size() > 4096) {
|
||||
allResults.subList(4096, allResults.size()).clear();
|
||||
if (allResults.size() > 512) {
|
||||
allResults.subList(512, allResults.size()).clear();
|
||||
}
|
||||
|
||||
var docIds = new CombinedDocIdList(allResults.toArray());
|
||||
var rankingContext = ResultRankingContext.create(indexReader, searchParameters);
|
||||
var rankingData = rankingService.prepareRankingData(rankingContext, new CombinedDocIdList(allResults.toArray()));
|
||||
|
||||
System.out.println("Running warmup loop!");
|
||||
int sum = 0;
|
||||
|
||||
Instant runEndTime = Instant.now().plus(warmupTime);
|
||||
|
||||
int iter;
|
||||
for (iter = 0;; iter++) {
|
||||
sum += rankingService.rankResults(rankingContext, docIds, false).size();
|
||||
if ((iter % 100) == 0 && Instant.now().isAfter(runEndTime)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
System.out.println("Warmup complete after " + iter + " iters!");
|
||||
|
||||
runEndTime = Instant.now().plus(runTime);
|
||||
Instant runEndTime = Instant.now().plus(runTime);
|
||||
Instant runStartTime = Instant.now();
|
||||
int sum2 = 0;
|
||||
List<Double> times = new ArrayList<>();
|
||||
|
||||
int iter;
|
||||
for (iter = 0;; iter++) {
|
||||
long start = System.nanoTime();
|
||||
sum2 += rankingService.rankResults(rankingContext, docIds, false).size();
|
||||
sum2 += rankingService.rankResults(rankingContext, rankingData, false).size();
|
||||
long end = System.nanoTime();
|
||||
times.add((end - start)/1_000_000.);
|
||||
|
||||
@@ -186,14 +175,19 @@ public class PerfTestMain {
|
||||
if (Instant.now().isAfter(runEndTime)) {
|
||||
break;
|
||||
}
|
||||
System.out.println(Duration.between(runStartTime, Instant.now()).toMillis() / 1000. + " best times: " + (allResults.size() / 4096.) * times.stream().mapToDouble(Double::doubleValue).sorted().limit(3).average().orElse(-1));
|
||||
if (times.size() > 100) {
|
||||
double[] timesSample = times.stream().mapToDouble(Double::doubleValue).skip(times.size() - 100).sorted().toArray();
|
||||
System.out.format("P1: %f P10: %f, P90: %f, P99: %f\n", timesSample[1], timesSample[10], timesSample[90], timesSample[99]);
|
||||
}
|
||||
System.out.println(Duration.between(runStartTime, Instant.now()).toMillis() / 1000. + " best times: " + (allResults.size() / 512.) * times.stream().mapToDouble(Double::doubleValue).sorted().limit(3).average().orElse(-1));
|
||||
}
|
||||
}
|
||||
System.out.println("Benchmark complete after " + iter + " iters!");
|
||||
System.out.println("Best times: " + (allResults.size() / 4096.) * times.stream().mapToDouble(Double::doubleValue).sorted().limit(3).average().orElse(-1));
|
||||
|
||||
System.out.println("Best times: " + (allResults.size() / 512.) * times.stream().mapToDouble(Double::doubleValue).sorted().limit(3).average().orElse(-1));
|
||||
System.out.println("Warmup sum: " + sum);
|
||||
System.out.println("Main sum: " + sum2);
|
||||
System.out.println(docIds.size());
|
||||
System.out.println(rankingData.size());
|
||||
}
|
||||
|
||||
public static void runExecution(Path homeDir,
|
||||
@@ -210,46 +204,39 @@ public class PerfTestMain {
|
||||
.setResultsByDomain(10)
|
||||
.setFetchSize(4096)
|
||||
.build();
|
||||
SearchSpecification parsedQuery = queryFactory.createQuery(new QueryParams(rawQuery, queryLimits, "NONE", NsfwFilterTier.OFF), PrototypeRankingParameters.sensibleDefaults()).specs;
|
||||
SearchSpecification parsedQuery = queryFactory.createQuery(new QueryParams(rawQuery, queryLimits, "NONE", NsfwFilterTier.OFF, "en"), PrototypeRankingParameters.sensibleDefaults()).specs;
|
||||
System.out.println("Query compiled to: " + parsedQuery.query.compiledQuery);
|
||||
|
||||
System.out.println("Running warmup loop!");
|
||||
int sum = 0;
|
||||
|
||||
Instant runEndTime = Instant.now().plus(warmupTime);
|
||||
|
||||
int iter;
|
||||
for (iter = 0;; iter++) {
|
||||
SearchParameters searchParameters = new SearchParameters(parsedQuery, new SearchSetAny());
|
||||
var execution = new IndexQueryExecution(searchParameters, rankingService, indexReader);
|
||||
execution.run();
|
||||
sum += execution.itemsProcessed();
|
||||
if ((iter % 100) == 0 && Instant.now().isAfter(runEndTime)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
System.out.println("Warmup complete after " + iter + " iters!");
|
||||
|
||||
runEndTime = Instant.now().plus(runTime);
|
||||
Instant runEndTime = Instant.now().plus(runTime);
|
||||
Instant runStartTime = Instant.now();
|
||||
int sum2 = 0;
|
||||
List<Double> rates = new ArrayList<>();
|
||||
List<Double> times = new ArrayList<>();
|
||||
int iter;
|
||||
for (iter = 0;; iter++) {
|
||||
SearchParameters searchParameters = new SearchParameters(parsedQuery, new SearchSetAny());
|
||||
var execution = new IndexQueryExecution(searchParameters, rankingService, indexReader);
|
||||
var execution = new IndexQueryExecution(indexReader, rankingService, SearchContext.create(indexReader, new KeywordHasher.AsciiIsh(), parsedQuery, new SearchSetAny()), 1);
|
||||
long start = System.nanoTime();
|
||||
execution.run();
|
||||
long end = System.nanoTime();
|
||||
sum2 += execution.itemsProcessed();
|
||||
rates.add(execution.itemsProcessed() / ((end - start)/1_000_000_000.));
|
||||
|
||||
times.add((end - start)/1_000_000.);
|
||||
indexReader.reset();
|
||||
if ((iter % 100) == 0) {
|
||||
if (Instant.now().isAfter(runEndTime)) {
|
||||
break;
|
||||
}
|
||||
if (times.size() > 100) {
|
||||
double[] timesSample = times.stream().mapToDouble(Double::doubleValue).skip(times.size() - 100).sorted().toArray();
|
||||
System.out.format("P1: %f P10: %f, P90: %f, P99: %f\n", timesSample[1], timesSample[10], timesSample[90], timesSample[99]);
|
||||
}
|
||||
System.out.println(Duration.between(runStartTime, Instant.now()).toMillis() / 1000. + " best rates: " + rates.stream().mapToDouble(Double::doubleValue).map(i -> -i).sorted().map(i -> -i).limit(3).average().orElse(-1));
|
||||
}
|
||||
}
|
||||
|
||||
System.out.println("Benchmark complete after " + iter + " iters!");
|
||||
System.out.println("Best counts: " + rates.stream().mapToDouble(Double::doubleValue).map(i -> -i).sorted().map(i -> -i).limit(3).average().orElse(-1));
|
||||
System.out.println("Warmup sum: " + sum);
|
||||
@@ -270,42 +257,25 @@ public class PerfTestMain {
|
||||
.setResultsByDomain(10)
|
||||
.setFetchSize(4096)
|
||||
.build();
|
||||
SearchSpecification parsedQuery = queryFactory.createQuery(new QueryParams(rawQuery, queryLimits, "NONE", NsfwFilterTier.OFF), PrototypeRankingParameters.sensibleDefaults()).specs;
|
||||
SearchSpecification parsedQuery = queryFactory.createQuery(new QueryParams(rawQuery, queryLimits, "NONE", NsfwFilterTier.OFF, "en"), PrototypeRankingParameters.sensibleDefaults()).specs;
|
||||
|
||||
System.out.println("Query compiled to: " + parsedQuery.query.compiledQuery);
|
||||
|
||||
SearchParameters searchParameters = new SearchParameters(parsedQuery, new SearchSetAny());
|
||||
SearchContext searchContext = SearchContext.create(indexReader, new KeywordHasher.AsciiIsh(), parsedQuery, new SearchSetAny());
|
||||
|
||||
|
||||
Instant runEndTime = Instant.now().plus(warmupTime);
|
||||
Instant runEndTime = Instant.now().plus(runTime);
|
||||
|
||||
LongQueryBuffer buffer = new LongQueryBuffer(4096);
|
||||
LongQueryBuffer buffer = new LongQueryBuffer(512);
|
||||
int sum1 = 0;
|
||||
int iter;
|
||||
for (iter = 0;; iter++) {
|
||||
List<IndexQuery> queries = indexReader.createQueries(new SearchTerms(searchParameters.query, searchParameters.compiledQueryIds), searchParameters.queryParams);
|
||||
|
||||
for (var query : queries) {
|
||||
while (query.hasMore()) {
|
||||
query.getMoreResults(buffer);
|
||||
sum1 += buffer.end;
|
||||
buffer.reset();
|
||||
}
|
||||
}
|
||||
|
||||
if ((iter % 100) == 0 && Instant.now().isAfter(runEndTime)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
System.out.println("Warmup complete after " + iter + " iters with sum1 = " + sum1);
|
||||
|
||||
runEndTime = Instant.now().plus(runTime);
|
||||
Instant runStartTime = Instant.now();
|
||||
int sum2 = 0;
|
||||
List<Double> times = new ArrayList<>();
|
||||
for (iter = 0;; iter++) {
|
||||
List<IndexQuery> queries = indexReader.createQueries(new SearchTerms(searchParameters.query, searchParameters.compiledQueryIds), searchParameters.queryParams);
|
||||
indexReader.reset();
|
||||
List<IndexQuery> queries = indexReader.createQueries(searchContext);
|
||||
|
||||
long start = System.nanoTime();
|
||||
for (var query : queries) {
|
||||
@@ -316,12 +286,16 @@ public class PerfTestMain {
|
||||
}
|
||||
}
|
||||
long end = System.nanoTime();
|
||||
times.add((end - start)/1_000_000.);
|
||||
times.add((end - start)/1_000_000_000.);
|
||||
|
||||
if ((iter % 100) == 0) {
|
||||
if ((iter % 10) == 0) {
|
||||
if (Instant.now().isAfter(runEndTime)) {
|
||||
break;
|
||||
}
|
||||
if (times.size() > 100) {
|
||||
double[] timesSample = times.stream().mapToDouble(Double::doubleValue).skip(times.size() - 100).sorted().toArray();
|
||||
System.out.format("P1: %f P10: %f, P90: %f, P99: %f\n", timesSample[1], timesSample[10], timesSample[90], timesSample[99]);
|
||||
}
|
||||
System.out.println(Duration.between(runStartTime, Instant.now()).toMillis() / 1000. + " best times: " + times.stream().mapToDouble(Double::doubleValue).sorted().limit(3).average().orElse(-1));
|
||||
}
|
||||
}
|
||||
|
@@ -1,41 +0,0 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
|
||||
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation project(':code:libraries:array')
|
||||
implementation project(':code:libraries:btree')
|
||||
implementation project(':code:libraries:coded-sequence')
|
||||
implementation project(':code:libraries:random-write-funnel')
|
||||
implementation project(':code:index:query')
|
||||
implementation project(':code:index:index-journal')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:processes:converting-process:model')
|
||||
|
||||
implementation project(':third-party:parquet-floor')
|
||||
implementation project(':third-party:commons-codec')
|
||||
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
implementation libs.slop
|
||||
implementation libs.fastutil
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
testImplementation project(':code:libraries:test-helpers')
|
||||
}
|
||||
|
@@ -1,69 +0,0 @@
|
||||
package nu.marginalia.index;
|
||||
|
||||
import nu.marginalia.array.page.LongQueryBuffer;
|
||||
import nu.marginalia.btree.BTreeReader;
|
||||
import nu.marginalia.index.query.EntrySource;
|
||||
|
||||
import static java.lang.Math.min;
|
||||
|
||||
public class FullIndexEntrySource implements EntrySource {
|
||||
private final String name;
|
||||
private final BTreeReader reader;
|
||||
|
||||
int pos;
|
||||
int endOffset;
|
||||
|
||||
final int entrySize;
|
||||
private final long wordId;
|
||||
|
||||
public FullIndexEntrySource(String name,
|
||||
BTreeReader reader,
|
||||
int entrySize,
|
||||
long wordId) {
|
||||
this.name = name;
|
||||
this.reader = reader;
|
||||
this.entrySize = entrySize;
|
||||
this.wordId = wordId;
|
||||
|
||||
pos = 0;
|
||||
endOffset = pos + entrySize * reader.numEntries();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void skip(int n) {
|
||||
pos += n;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void read(LongQueryBuffer buffer) {
|
||||
buffer.reset();
|
||||
buffer.end = min(buffer.end, endOffset - pos);
|
||||
reader.readData(buffer.data, buffer.end, pos);
|
||||
pos += buffer.end;
|
||||
|
||||
destagger(buffer);
|
||||
buffer.uniq();
|
||||
}
|
||||
|
||||
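// Note (added): destagger() keeps only the first long of each entrySize-wide entry,
// i.e. the combined document id, and drops the trailing columns (such as the
// positions-file offset in the full index), shrinking buffer.end to the new entry count.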
private void destagger(LongQueryBuffer buffer) {
|
||||
if (entrySize == 1)
|
||||
return;
|
||||
|
||||
for (int ri = entrySize, wi=1; ri < buffer.end ; ri+=entrySize, wi++) {
|
||||
buffer.data.set(wi, buffer.data.get(ri));
|
||||
}
|
||||
|
||||
buffer.end /= entrySize;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasMore() {
|
||||
return pos < endOffset;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String indexName() {
|
||||
return name + ":" + Long.toHexString(wordId);
|
||||
}
|
||||
}
|
@@ -1,183 +0,0 @@
|
||||
package nu.marginalia.index;
|
||||
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.btree.BTreeReader;
|
||||
import nu.marginalia.index.positions.PositionsFileReader;
|
||||
import nu.marginalia.index.positions.TermData;
|
||||
import nu.marginalia.index.query.EmptyEntrySource;
|
||||
import nu.marginalia.index.query.EntrySource;
|
||||
import nu.marginalia.index.query.ReverseIndexRejectFilter;
|
||||
import nu.marginalia.index.query.ReverseIndexRetainFilter;
|
||||
import nu.marginalia.index.query.filter.QueryFilterLetThrough;
|
||||
import nu.marginalia.index.query.filter.QueryFilterNoPass;
|
||||
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.foreign.Arena;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.concurrent.Executors;
|
||||
|
||||
public class FullReverseIndexReader {
|
||||
private final LongArray words;
|
||||
private final LongArray documents;
|
||||
private final long wordsDataOffset;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final BTreeReader wordsBTreeReader;
|
||||
private final String name;
|
||||
|
||||
private final PositionsFileReader positionsFileReader;
|
||||
|
||||
public FullReverseIndexReader(String name,
|
||||
Path words,
|
||||
Path documents,
|
||||
PositionsFileReader positionsFileReader) throws IOException {
|
||||
this.name = name;
|
||||
|
||||
this.positionsFileReader = positionsFileReader;
|
||||
|
||||
if (!Files.exists(words) || !Files.exists(documents)) {
|
||||
this.words = null;
|
||||
this.documents = null;
|
||||
this.wordsBTreeReader = null;
|
||||
this.wordsDataOffset = -1;
|
||||
return;
|
||||
}
|
||||
|
||||
logger.info("Switching reverse index");
|
||||
|
||||
this.words = LongArrayFactory.mmapForReadingShared(words);
|
||||
this.documents = LongArrayFactory.mmapForReadingShared(documents);
|
||||
|
||||
wordsBTreeReader = new BTreeReader(this.words, ReverseIndexParameters.wordsBTreeContext, 0);
|
||||
wordsDataOffset = wordsBTreeReader.getHeader().dataOffsetLongs();
|
||||
|
||||
if (getClass().desiredAssertionStatus()) {
|
||||
if (Boolean.getBoolean("index-self-test")) {
|
||||
Executors.newSingleThreadExecutor().execute(this::selfTest);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void selfTest() {
|
||||
logger.info("Running self test program");
|
||||
|
||||
long wordsDataSize = wordsBTreeReader.getHeader().numEntries() * 2L;
|
||||
var wordsDataRange = words.range(wordsDataOffset, wordsDataOffset + wordsDataSize);
|
||||
|
||||
// ReverseIndexSelfTest.runSelfTest1(wordsDataRange, wordsDataSize);
|
||||
// ReverseIndexSelfTest.runSelfTest2(wordsDataRange, documents);
|
||||
// ReverseIndexSelfTest.runSelfTest3(wordsDataRange, wordsBTreeReader);
|
||||
// ReverseIndexSelfTest.runSelfTest4(wordsDataRange, documents);
|
||||
ReverseIndexSelfTest.runSelfTest5(wordsDataRange, wordsBTreeReader);
|
||||
ReverseIndexSelfTest.runSelfTest6(wordsDataRange, documents);
|
||||
}
|
||||
|
||||
|
||||
/** Calculate the offset of the word in the documents.
|
||||
* If the return-value is negative, the term does not exist
|
||||
* in the index.
|
||||
*/
|
||||
long wordOffset(long termId) {
|
||||
long idx = wordsBTreeReader.findEntry(termId);
|
||||
|
||||
if (idx < 0)
|
||||
return -1L;
|
||||
|
||||
return words.get(wordsDataOffset + idx + 1);
|
||||
}
|
||||
|
||||
public EntrySource documents(long termId) {
|
||||
if (null == words) {
|
||||
logger.warn("Reverse index is not ready, dropping query");
|
||||
return new EmptyEntrySource();
|
||||
}
|
||||
|
||||
long offset = wordOffset(termId);
|
||||
|
||||
if (offset < 0) // No documents
|
||||
return new EmptyEntrySource();
|
||||
|
||||
return new FullIndexEntrySource(name, createReaderNew(offset), 2, termId);
|
||||
}
|
||||
|
||||
/** Create a filter step requiring the specified termId to exist in the documents */
|
||||
public QueryFilterStepIf also(long termId) {
|
||||
long offset = wordOffset(termId);
|
||||
|
||||
if (offset < 0) // No documents
|
||||
return new QueryFilterNoPass();
|
||||
|
||||
return new ReverseIndexRetainFilter(createReaderNew(offset), name, termId);
|
||||
}
|
||||
|
||||
/** Create a filter step requiring the specified termId to be absent from the documents */
|
||||
public QueryFilterStepIf not(long termId) {
|
||||
long offset = wordOffset(termId);
|
||||
|
||||
if (offset < 0) // No documents
|
||||
return new QueryFilterLetThrough();
|
||||
|
||||
return new ReverseIndexRejectFilter(createReaderNew(offset));
|
||||
}
|
||||
|
||||
/** Return the number of documents with the termId in the index */
|
||||
public int numDocuments(long termId) {
|
||||
long offset = wordOffset(termId);
|
||||
|
||||
if (offset < 0)
|
||||
return 0;
|
||||
|
||||
return createReaderNew(offset).numEntries();
|
||||
}
|
||||
|
||||
/** Create a BTreeReader for the document offset associated with a termId */
|
||||
private BTreeReader createReaderNew(long offset) {
|
||||
return new BTreeReader(
|
||||
documents,
|
||||
ReverseIndexParameters.fullDocsBTreeContext,
|
||||
offset);
|
||||
}
|
||||
|
||||
public TermData[] getTermData(Arena arena,
|
||||
long termId,
|
||||
long[] docIds)
|
||||
{
|
||||
var ret = new TermData[docIds.length];
|
||||
|
||||
long offset = wordOffset(termId);
|
||||
|
||||
if (offset < 0) {
|
||||
// This is likely a bug in the code, but we can't throw an exception here
|
||||
logger.debug("Missing offset for word {}", termId);
|
||||
return ret;
|
||||
}
|
||||
|
||||
var reader = createReaderNew(offset);
|
||||
|
||||
// Read the size and offset of the position data
|
||||
var offsets = reader.queryData(docIds, 1);
|
||||
|
||||
return positionsFileReader.getTermData(arena, offsets);
|
||||
}
|
||||
|
||||
public void close() {
|
||||
if (documents != null)
|
||||
documents.close();
|
||||
|
||||
if (words != null)
|
||||
words.close();
|
||||
|
||||
if (positionsFileReader != null) {
|
||||
try {
|
||||
positionsFileReader.close();
|
||||
} catch (IOException e) {
|
||||
logger.error("Failed to close positions file reader", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@@ -1,33 +0,0 @@
|
||||
package nu.marginalia.index;
|
||||
|
||||
import java.nio.file.Path;
|
||||
|
||||
public class ReverseIndexFullFileNames {
|
||||
public static Path resolve(Path basePath, FileIdentifier identifier, FileVersion version) {
|
||||
return switch (identifier) {
|
||||
case WORDS -> switch (version) {
|
||||
case NEXT -> basePath.resolve("rev-words.dat.next");
|
||||
case CURRENT -> basePath.resolve("rev-words.dat");
|
||||
};
|
||||
case DOCS -> switch (version) {
|
||||
case NEXT -> basePath.resolve("rev-docs.dat.next");
|
||||
case CURRENT -> basePath.resolve("rev-docs.dat");
|
||||
};
|
||||
case POSITIONS -> switch (version) {
|
||||
case NEXT -> basePath.resolve("rev-positions.dat.next");
|
||||
case CURRENT -> basePath.resolve("rev-positions.dat");
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
public enum FileVersion {
|
||||
CURRENT,
|
||||
NEXT,
|
||||
}
|
||||
|
||||
public enum FileIdentifier {
|
||||
WORDS,
|
||||
DOCS,
|
||||
POSITIONS,
|
||||
}
|
||||
}
|
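// A small usage sketch of the name resolution above; the base path is hypothetical,
// not taken from the repository.
class ReverseIndexFileNameExample {
    public static void main(String[] args) {
        java.nio.file.Path base = java.nio.file.Path.of("/tmp/index");
        java.nio.file.Path nextDocs = ReverseIndexFullFileNames.resolve(base,
                ReverseIndexFullFileNames.FileIdentifier.DOCS,
                ReverseIndexFullFileNames.FileVersion.NEXT);
        System.out.println(nextDocs); // prints /tmp/index/rev-docs.dat.next
    }
}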
@@ -1,11 +0,0 @@
|
||||
package nu.marginalia.index;
|
||||
|
||||
import nu.marginalia.btree.model.BTreeBlockSize;
|
||||
import nu.marginalia.btree.model.BTreeContext;
|
||||
|
||||
public class ReverseIndexParameters
|
||||
{
|
||||
public static final BTreeContext prioDocsBTreeContext = new BTreeContext(5, 1, BTreeBlockSize.BS_2048);
|
||||
public static final BTreeContext fullDocsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048);
|
||||
public static final BTreeContext wordsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048);
|
||||
}
|
@@ -1,28 +0,0 @@
|
||||
package nu.marginalia.index;
|
||||
|
||||
import java.nio.file.Path;
|
||||
|
||||
public class ReverseIndexPrioFileNames {
|
||||
public static Path resolve(Path basePath, FileIdentifier identifier, FileVersion version) {
|
||||
return switch (identifier) {
|
||||
case WORDS -> switch (version) {
|
||||
case NEXT -> basePath.resolve("rev-prio-words.dat.next");
|
||||
case CURRENT -> basePath.resolve("rev-prio-words.dat");
|
||||
};
|
||||
case DOCS -> switch (version) {
|
||||
case NEXT -> basePath.resolve("rev-prio-docs.dat.next");
|
||||
case CURRENT -> basePath.resolve("rev-prio-docs.dat");
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
public enum FileVersion {
|
||||
CURRENT,
|
||||
NEXT
|
||||
}
|
||||
|
||||
public enum FileIdentifier {
|
||||
WORDS,
|
||||
DOCS,
|
||||
}
|
||||
}
|
@@ -1,109 +0,0 @@
|
||||
package nu.marginalia.index;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.btree.BTreeReader;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.Random;
|
||||
|
||||
public class ReverseIndexSelfTest {
|
||||
private static final Logger logger = LoggerFactory.getLogger(ReverseIndexSelfTest.class);
|
||||
public static void runSelfTest1(LongArray wordsDataRange, long wordsDataSize) {
|
||||
logger.info("Starting test 1");
|
||||
|
||||
if (!wordsDataRange.isSortedN(2, 0, wordsDataSize))
|
||||
logger.error("Failed test 1: Words data is not sorted");
|
||||
else
|
||||
logger.info("Passed test 1");
|
||||
}
|
||||
|
||||
public static void runSelfTest2(LongArray wordsDataRange, LongArray documents) {
|
||||
logger.info("Starting test 2");
|
||||
for (long i = 1; i < wordsDataRange.size(); i+=2) {
|
||||
var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
|
||||
var header = docsBTreeReader.getHeader();
|
||||
var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L);
|
||||
|
||||
if (!docRange.isSortedN(2, 0, header.numEntries() * 2L)) {
|
||||
logger.error("Failed test 2: numEntries={}, offset={}", header.numEntries(), header.dataOffsetLongs());
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
logger.info("Passed test 2");
|
||||
}
|
||||
|
||||
public static void runSelfTest3(LongArray wordsDataRange, BTreeReader reader) {
|
||||
logger.info("Starting test 3");
|
||||
for (long i = 0; i < wordsDataRange.size(); i+=2) {
|
||||
if (reader.findEntry(wordsDataRange.get(i)) < 0) {
|
||||
logger.error("Failed Test 3");
|
||||
return;
|
||||
}
|
||||
}
|
||||
logger.info("Passed test 3");
|
||||
}
|
||||
|
||||
public static void runSelfTest4(LongArray wordsDataRange, LongArray documents) {
|
||||
logger.info("Starting test 4");
|
||||
for (long i = 1; i < wordsDataRange.size(); i+=2) {
|
||||
var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
|
||||
var header = docsBTreeReader.getHeader();
|
||||
var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L);
|
||||
for (int j = 0; j < docRange.size(); j+=2) {
|
||||
if (docsBTreeReader.findEntry(docRange.get(j)) < 0) {
|
||||
logger.info("Failed test 4");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
logger.info("Passed test 4");
|
||||
}
|
||||
public static void runSelfTest5(LongArray wordsDataRange, BTreeReader wordsBTreeReader) {
|
||||
logger.info("Starting test 5");
|
||||
LongOpenHashSet words = new LongOpenHashSet((int)wordsDataRange.size()/2);
|
||||
for (int i = 0; i < wordsDataRange.size(); i+=2) {
|
||||
words.add(wordsDataRange.get(i));
|
||||
}
|
||||
var random = new Random();
|
||||
for (int i = 0; i < 100_000_000; i++) {
|
||||
long v;
|
||||
do {
|
||||
v = random.nextLong();
|
||||
} while (words.contains(v));
|
||||
if (wordsBTreeReader.findEntry(v) >= 0) {
|
||||
logger.error("Failed test 5 @ W{}", v);
|
||||
return;
|
||||
}
|
||||
}
|
||||
logger.info("Passed test 5");
|
||||
}
|
||||
|
||||
public static void runSelfTest6(LongArray wordsDataRange, LongArray documents) {
|
||||
logger.info("Starting test 6");
|
||||
for (long i = 1; i < wordsDataRange.size(); i+=2) {
|
||||
var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
|
||||
var header = docsBTreeReader.getHeader();
|
||||
var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L);
|
||||
Long prev = null;
|
||||
for (int j = 0; j < docRange.size(); j+=2) {
|
||||
if (prev == null) {
|
||||
prev = docRange.get(j);
|
||||
continue;
|
||||
}
|
||||
long thisVal = prev + 1;
|
||||
long nextVal = docRange.get(j);
|
||||
while (thisVal < nextVal) {
|
||||
if (docsBTreeReader.findEntry(thisVal) >= 0) {
|
||||
logger.info("Failed test 6 @ W{}:D{}", wordsDataRange.get(i-1), thisVal);
|
||||
return;
|
||||
}
|
||||
thisVal++;
|
||||
}
|
||||
}
|
||||
}
|
||||
logger.info("Passed test 6");
|
||||
}
|
||||
}
|
@@ -1,76 +0,0 @@
|
||||
package nu.marginalia.index.construction;
|
||||
|
||||
import nu.marginalia.index.positions.PositionCodec;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
/** A class for constructing a positions file. This class is thread-safe.
|
||||
*
|
||||
* <p></p>
|
||||
*
|
||||
* The positions data is concatenated in the file, with each term's metadata
|
||||
* followed by its positions. The metadata is a single byte, and the positions
|
||||
* are encoded using the Elias Gamma code, with zero padded bits at the end to
|
||||
* get octet alignment.
|
||||
*
|
||||
* <p></p>
|
||||
*
|
||||
* It is the responsibility of the caller to keep track of the byte offset of
|
||||
* each posting in the file.
|
||||
*/
|
||||
public class PositionsFileConstructor implements AutoCloseable {
|
||||
private final ByteBuffer workBuffer = ByteBuffer.allocate(65536);
|
||||
|
||||
private final Path file;
|
||||
private final FileChannel channel;
|
||||
|
||||
private long offset;
|
||||
|
||||
public PositionsFileConstructor(Path file) throws IOException {
|
||||
this.file = file;
|
||||
|
||||
channel = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
|
||||
}
|
||||
|
||||
/** Add a term to the positions file
|
||||
* @param termMeta the term metadata
|
||||
* @param positionsBuffer the positions of the term
|
||||
* @return the offset of the term in the file, with the size of the data in the highest byte
|
||||
*/
|
||||
public long add(byte termMeta, ByteBuffer positionsBuffer) throws IOException {
|
||||
synchronized (file) {
|
||||
int size = 1 + positionsBuffer.remaining();
|
||||
|
||||
if (workBuffer.remaining() < size) {
|
||||
workBuffer.flip();
|
||||
channel.write(workBuffer);
|
||||
workBuffer.clear();
|
||||
}
|
||||
|
||||
workBuffer.put(termMeta);
|
||||
workBuffer.put(positionsBuffer);
|
||||
|
||||
long ret = PositionCodec.encode(size, offset);
|
||||
|
||||
offset += size;
|
||||
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
if (workBuffer.hasRemaining()) {
|
||||
workBuffer.flip();
|
||||
|
||||
while (workBuffer.hasRemaining())
|
||||
channel.write(workBuffer);
|
||||
}
|
||||
|
||||
channel.force(false);
|
||||
channel.close();
|
||||
}
|
||||
}
|
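// The add() method above returns the term's byte offset packed together with its size,
// with the size stored in the highest byte (per the javadoc). The following is a minimal
// illustrative packing of that scheme; the real encoding lives in PositionCodec and may
// differ in detail.
final class OffsetCodecSketch {
    static long encode(int size, long offset) {
        return ((long) size << 56) | (offset & 0x00FF_FFFF_FFFF_FFFFL);
    }
    static int decodeSize(long encoded) {
        return (int) (encoded >>> 56);
    }
    static long decodeOffset(long encoded) {
        return encoded & 0x00FF_FFFF_FFFF_FFFFL;
    }
}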
@@ -1,46 +0,0 @@
|
||||
package nu.marginalia.index.construction.full;
|
||||
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.algo.LongArrayTransformations;
|
||||
import nu.marginalia.btree.BTreeWriter;
|
||||
import nu.marginalia.btree.model.BTreeContext;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/** Constructs the BTrees in a reverse index */
|
||||
public class FullIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer {
|
||||
private final BTreeWriter writer;
|
||||
private final int entrySize;
|
||||
private final LongArray documentsArray;
|
||||
|
||||
long start = 0;
|
||||
long writeOffset = 0;
|
||||
|
||||
public FullIndexBTreeTransformer(LongArray urlsFileMap,
|
||||
int entrySize,
|
||||
BTreeContext bTreeContext,
|
||||
LongArray documentsArray) {
|
||||
this.documentsArray = documentsArray;
|
||||
this.writer = new BTreeWriter(urlsFileMap, bTreeContext);
|
||||
this.entrySize = entrySize;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long transform(long pos, long end) throws IOException {
|
||||
|
||||
final int size = (int) ((end - start) / entrySize);
|
||||
|
||||
if (size == 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
final long offsetForBlock = writeOffset;
|
||||
|
||||
writeOffset += writer.write(writeOffset, size,
|
||||
mapRegion -> mapRegion.transferFrom(documentsArray, start, 0, end - start)
|
||||
);
|
||||
|
||||
start = end;
|
||||
return offsetForBlock;
|
||||
}
|
||||
}
|
@@ -1,88 +0,0 @@
|
||||
package nu.marginalia.index.positions;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.foreign.Arena;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.concurrent.CountDownLatch;
|
||||
import java.util.concurrent.ForkJoinPool;
|
||||
|
||||
/** Reads positions data from the positions file */
|
||||
public class PositionsFileReader implements AutoCloseable {
|
||||
|
||||
// We use multiple file channels to avoid reads becoming serialized by the kernel.
|
||||
// If we don't do this, multi-threaded reads become strictly slower than single-threaded reads
|
||||
// (which is why AsynchronousFileChannel sucks).
|
||||
|
||||
// This is likely the best option apart from O_DIRECT or FFI:ing in libaio or io_uring.
|
||||
|
||||
private final FileChannel[] positions;
|
||||
private final ForkJoinPool forkJoinPool;
|
||||
private static final Logger logger = LoggerFactory.getLogger(PositionsFileReader.class);
|
||||
|
||||
public PositionsFileReader(Path positionsFile) throws IOException {
|
||||
this(positionsFile, 8);
|
||||
}
|
||||
|
||||
public PositionsFileReader(Path positionsFile, int nreaders) throws IOException {
|
||||
positions = new FileChannel[nreaders];
|
||||
for (int i = 0; i < positions.length; i++) {
|
||||
positions[i] = FileChannel.open(positionsFile, StandardOpenOption.READ);
|
||||
}
|
||||
forkJoinPool = new ForkJoinPool(nreaders);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
for (FileChannel fc : positions) {
|
||||
fc.close();
|
||||
}
|
||||
forkJoinPool.close();
|
||||
}
|
||||
|
||||
/** Get the positions for a set of keywords in the index, as pointed out by the encoded offsets;
|
||||
* intermediate buffers are allocated from the provided arena allocator. */
|
||||
public TermData[] getTermData(Arena arena, long[] offsets) {
|
||||
TermData[] ret = new TermData[offsets.length];
|
||||
|
||||
int tasks = 0;
|
||||
for (long l : offsets) if (l != 0) tasks++;
|
||||
|
||||
CountDownLatch cl = new CountDownLatch(tasks);
|
||||
|
||||
for (int i = 0; i < offsets.length; i++) {
|
||||
long encodedOffset = offsets[i];
|
||||
if (encodedOffset == 0) continue;
|
||||
|
||||
int idx = i;
|
||||
int length = PositionCodec.decodeSize(encodedOffset);
|
||||
long offset = PositionCodec.decodeOffset(encodedOffset);
|
||||
ByteBuffer buffer = arena.allocate(length).asByteBuffer();
|
||||
|
||||
forkJoinPool.execute(() -> {
|
||||
try {
|
||||
positions[idx % positions.length].read(buffer, offset);
|
||||
ret[idx] = new TermData(buffer);
|
||||
cl.countDown();
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.error("Failed to read positions file", ex);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
try {
|
||||
cl.await();
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
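// A hedged usage sketch of the reader above (assumed to sit in the same package as
// PositionsFileReader and TermData): fetch position data for a few encoded (size|offset)
// values inside a confined Arena. The file name and the offsets are hypothetical; real
// offsets come from PositionsFileConstructor.add().
class PositionsReadSketch {
    public static void main(String[] args) throws Exception {
        java.nio.file.Path positionsFile = java.nio.file.Path.of("rev-positions.dat");
        try (PositionsFileReader reader = new PositionsFileReader(positionsFile);
             java.lang.foreign.Arena arena = java.lang.foreign.Arena.ofConfined()) {
            long[] encodedOffsets = {};   // fill with values returned by PositionsFileConstructor.add()
            TermData[] data = reader.getTermData(arena, encodedOffsets);
            System.out.println("fetched " + data.length + " term data entries");
        }
    }
}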
@@ -1,28 +0,0 @@
|
||||
package nu.marginalia.index.query;
|
||||
|
||||
import nu.marginalia.array.page.LongQueryBuffer;
|
||||
import nu.marginalia.btree.BTreeReader;
|
||||
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||
|
||||
public record ReverseIndexRejectFilter(BTreeReader range) implements QueryFilterStepIf {
|
||||
|
||||
@Override
|
||||
public void apply(LongQueryBuffer buffer) {
|
||||
range.rejectEntries(buffer);
|
||||
buffer.finalizeFiltering();
|
||||
}
|
||||
|
||||
public boolean test(long id) {
|
||||
return range.findEntry(id) < 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double cost() {
|
||||
return range.numEntries();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String describe() {
|
||||
return "ReverseIndexRejectFilter[]";
|
||||
}
|
||||
}
|
@@ -1,28 +0,0 @@
|
||||
package nu.marginalia.index.query;
|
||||
|
||||
import nu.marginalia.array.page.LongQueryBuffer;
|
||||
import nu.marginalia.btree.BTreeReader;
|
||||
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||
|
||||
public record ReverseIndexRetainFilter(BTreeReader range, String name, long wordId) implements QueryFilterStepIf {
|
||||
|
||||
@Override
|
||||
public void apply(LongQueryBuffer buffer) {
|
||||
range.retainEntries(buffer);
|
||||
buffer.finalizeFiltering();
|
||||
}
|
||||
|
||||
public boolean test(long id) {
|
||||
return range.findEntry(id) >= 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double cost() {
|
||||
return range.numEntries();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String describe() {
|
||||
return "Retain:" + name + "/" + wordId;
|
||||
}
|
||||
}
|
@@ -1,56 +0,0 @@
|
||||
# Reverse Index
|
||||
|
||||
The reverse index contains a mapping from word to document id.
|
||||
|
||||
There are two tiers of this index.
|
||||
|
||||
* A priority index which only indexes terms that are flagged with priority flags<sup>1</sup>.
|
||||
* A full index that indexes all terms.
|
||||
|
||||
The full index also provides access to term-level metadata, while the priority index is
|
||||
a binary index that only offers information about which documents have a specific word.
|
||||
|
||||
The priority index is also compressed, while the full index at this point is not.
|
||||
|
||||
[1] See WordFlags in [common/model](../../common/model/) and
|
||||
KeywordMetadata in [converting-process/ft-keyword-extraction](../../processes/converting-process/ft-keyword-extraction).
|
||||
|
||||
## Construction
|
||||
|
||||
The reverse index is constructed by first building a series of preindexes.
|
||||
Preindexes consist of a Segment and a Documents object. The segment contains
|
||||
information about which word identifiers are present and how many entries each one has, and the
|
||||
documents contain information about in which documents the words can be found.
|
||||
|
||||

|
||||
|
||||
These would typically not fit in RAM, so the index journal is paged
|
||||
and the preindexes are constructed small enough to fit in memory, and
|
||||
then merged. Merging sorted arrays is a very fast operation that does
|
||||
not require additional RAM.
|
||||
|
||||

|
||||
|
||||
Once merged into one large preindex, indexes are added to the preindex data
|
||||
to form a finalized reverse index.
|
||||
|
||||

|
||||
## Central Classes
|
||||
|
||||
Full index:
|
||||
* [FullPreindex](java/nu/marginalia/index/construction/full/FullPreindex.java) intermediate reverse index state.
|
||||
* [FullIndexConstructor](java/nu/marginalia/index/construction/full/FullIndexConstructor.java) constructs the index.
|
||||
* [FullReverseIndexReader](java/nu/marginalia/index/FullReverseIndexReader.java) interrogates the index.
|
||||
|
||||
Prio index:
|
||||
* [PrioPreindex](java/nu/marginalia/index/construction/prio/PrioPreindex.java) intermediate reverse index state.
|
||||
* [PrioIndexConstructor](java/nu/marginalia/index/construction/prio/PrioIndexConstructor.java) constructs the index.
|
||||
* [PrioIndexReader](java/nu/marginalia/index/PrioReverseIndexReader.java) interrogates the index.
|
||||
|
||||
|
||||
## See Also
|
||||
|
||||
* [index-journal](../index-journal)
|
||||
* [index-forward](../index-forward)
|
||||
* [libraries/btree](../../libraries/btree)
|
||||
* [libraries/array](../../libraries/array)
|
@@ -1,49 +0,0 @@
|
||||
package nu.marginalia.index;
|
||||
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.btree.BTreeReader;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
public class ReverseIndexDebugTest {
|
||||
@Test
|
||||
@Disabled // this is a debugging utility
|
||||
public void debug() throws IOException {
|
||||
long problemWord = -7909917549851025932L;
|
||||
long problemDoc = 9079256848846028801L;
|
||||
|
||||
var words = LongArrayFactory.mmapForReadingConfined(Path.of("/home/vlofgren/Code/MarginaliaSearch/run/node-1/index/ir/rev-words.dat"));
|
||||
var documents = LongArrayFactory.mmapForReadingConfined(Path.of("/home/vlofgren/Code/MarginaliaSearch/run/node-1/index/ir/rev-docs.dat"));
|
||||
|
||||
var wordsBTreeReader = new BTreeReader(words, ReverseIndexParameters.wordsBTreeContext, 0);
|
||||
var wordsDataOffset = wordsBTreeReader.getHeader().dataOffsetLongs();
|
||||
|
||||
long wordOffset = wordsBTreeReader.findEntry(problemWord);
|
||||
assertTrue(wordOffset >= 0);
|
||||
|
||||
var docsReader = new BTreeReader(documents, ReverseIndexParameters.prioDocsBTreeContext, wordOffset);
|
||||
|
||||
// We find problemDoc even though it doesn't exist in the document range
|
||||
long docOffset = docsReader.findEntry(problemDoc);
|
||||
assertTrue(docOffset < 0);
|
||||
|
||||
// We know it doesn't exist because when we check, we can't find it,
|
||||
// either by iterating...
|
||||
var dataRange = docsReader.data();
|
||||
System.out.println(dataRange.size());
|
||||
for (int i = 0; i < dataRange.size(); i+=2) {
|
||||
|
||||
assertNotEquals(problemDoc, dataRange.get(i));
|
||||
}
|
||||
|
||||
// or by binary searching
|
||||
assertTrue(dataRange.binarySearchN(2, problemDoc, 0, dataRange.size()) < 0);
|
||||
|
||||
|
||||
}
|
||||
}
|
@@ -1,149 +0,0 @@
|
||||
|
||||
package nu.marginalia.index.construction.full;
|
||||
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.btree.model.BTreeHeader;
|
||||
import nu.marginalia.hash.MurmurHash3_128;
|
||||
import nu.marginalia.index.construction.DocIdRewriter;
|
||||
import nu.marginalia.index.construction.PositionsFileConstructor;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import static nu.marginalia.index.construction.full.TestJournalFactory.EntryDataWithWordMeta;
|
||||
import static nu.marginalia.index.construction.full.TestJournalFactory.wm;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
class FullPreindexFinalizeTest {
|
||||
TestJournalFactory journalFactory;
|
||||
Path positionsFile;
|
||||
Path countsFile;
|
||||
Path wordsIdFile;
|
||||
Path docsFile;
|
||||
Path tempDir;
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws IOException {
|
||||
journalFactory = new TestJournalFactory();
|
||||
|
||||
positionsFile = Files.createTempFile("positions", ".dat");
|
||||
countsFile = Files.createTempFile("counts", ".dat");
|
||||
wordsIdFile = Files.createTempFile("words", ".dat");
|
||||
docsFile = Files.createTempFile("docs", ".dat");
|
||||
tempDir = Files.createTempDirectory("sort");
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
public void tearDown() throws IOException {
|
||||
journalFactory.clear();
|
||||
|
||||
Files.deleteIfExists(countsFile);
|
||||
Files.deleteIfExists(wordsIdFile);
|
||||
List<Path> contents = new ArrayList<>();
|
||||
Files.list(tempDir).forEach(contents::add);
|
||||
for (var tempFile : contents) {
|
||||
Files.delete(tempFile);
|
||||
}
|
||||
Files.delete(tempDir);
|
||||
}
|
||||
|
||||
MurmurHash3_128 hash = new MurmurHash3_128();
|
||||
long termId(String keyword) {
|
||||
return hash.hashKeyword(keyword);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFinalizeSimple() throws IOException {
|
||||
var reader = journalFactory.createReader(new EntryDataWithWordMeta(100, 101, wm(50, 51)));
|
||||
var preindex = FullPreindex.constructPreindex(reader,
|
||||
new PositionsFileConstructor(positionsFile),
|
||||
DocIdRewriter.identity(), tempDir);
|
||||
|
||||
|
||||
preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat"));
|
||||
preindex.delete();
|
||||
|
||||
Path wordsFile = tempDir.resolve("words.dat");
|
||||
Path docsFile = tempDir.resolve("docs.dat");
|
||||
|
||||
assertTrue(Files.exists(wordsFile));
|
||||
assertTrue(Files.exists(docsFile));
|
||||
|
||||
System.out.println(Files.size(wordsFile));
|
||||
System.out.println(Files.size(docsFile));
|
||||
|
||||
var docsArray = LongArrayFactory.mmapForReadingConfined(docsFile);
|
||||
var wordsArray = LongArrayFactory.mmapForReadingConfined(wordsFile);
|
||||
|
||||
var docsHeader = new BTreeHeader(docsArray, 0);
|
||||
var wordsHeader = new BTreeHeader(wordsArray, 0);
|
||||
|
||||
assertEquals(1, docsHeader.numEntries());
|
||||
assertEquals(1, wordsHeader.numEntries());
|
||||
|
||||
assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
|
||||
assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs()));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testFinalizeSimple2x2() throws IOException {
|
||||
var reader = journalFactory.createReader(
|
||||
new EntryDataWithWordMeta(100, 101, wm(50, 51)),
|
||||
new EntryDataWithWordMeta(101, 101, wm(51, 52))
|
||||
);
|
||||
|
||||
var preindex = FullPreindex.constructPreindex(reader,
|
||||
new PositionsFileConstructor(positionsFile),
|
||||
DocIdRewriter.identity(), tempDir);
|
||||
|
||||
preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat"));
|
||||
preindex.delete();
|
||||
|
||||
Path wordsFile = tempDir.resolve("words.dat");
|
||||
Path docsFile = tempDir.resolve("docs.dat");
|
||||
|
||||
assertTrue(Files.exists(wordsFile));
|
||||
assertTrue(Files.exists(docsFile));
|
||||
|
||||
System.out.println(Files.size(wordsFile));
|
||||
System.out.println(Files.size(docsFile));
|
||||
|
||||
var docsArray = LongArrayFactory.mmapForReadingConfined(docsFile);
|
||||
var wordsArray = LongArrayFactory.mmapForReadingConfined(wordsFile);
|
||||
|
||||
|
||||
var wordsHeader = new BTreeHeader(wordsArray, 0);
|
||||
|
||||
System.out.println(wordsHeader);
|
||||
|
||||
assertEquals(2, wordsHeader.numEntries());
|
||||
|
||||
long offset1 = wordsArray.get(wordsHeader.dataOffsetLongs() + 1);
|
||||
long offset2 = wordsArray.get(wordsHeader.dataOffsetLongs() + 3);
|
||||
|
||||
assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs()));
|
||||
assertEquals(termId("51"), wordsArray.get(wordsHeader.dataOffsetLongs() + 2));
|
||||
|
||||
BTreeHeader docsHeader;
|
||||
|
||||
docsHeader = new BTreeHeader(docsArray, offset1);
|
||||
System.out.println(docsHeader);
|
||||
assertEquals(1, docsHeader.numEntries());
|
||||
|
||||
assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
|
||||
|
||||
docsHeader = new BTreeHeader(docsArray, offset2);
|
||||
System.out.println(docsHeader);
|
||||
assertEquals(1, docsHeader.numEntries());
|
||||
|
||||
assertEquals(101, docsArray.get(docsHeader.dataOffsetLongs() + 0));
|
||||
}
|
||||
}
|
(binary image diff: 21 KiB before, 21 KiB after)
@@ -1,22 +1,24 @@
|
||||
package nu.marginalia.index.index;
|
||||
package nu.marginalia.index;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||
import it.unimi.dsi.fastutil.longs.LongList;
|
||||
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
|
||||
import it.unimi.dsi.fastutil.longs.LongSet;
|
||||
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
|
||||
import nu.marginalia.index.FullReverseIndexReader;
|
||||
import nu.marginalia.index.PrioReverseIndexReader;
|
||||
import nu.marginalia.api.searchquery.model.query.SpecificationLimitType;
|
||||
import nu.marginalia.array.page.LongQueryBuffer;
|
||||
import nu.marginalia.index.forward.ForwardIndexReader;
|
||||
import nu.marginalia.index.forward.spans.DocumentSpans;
|
||||
import nu.marginalia.index.model.CombinedDocIdList;
|
||||
import nu.marginalia.index.model.QueryParams;
|
||||
import nu.marginalia.index.model.SearchTerms;
|
||||
import nu.marginalia.index.query.IndexQuery;
|
||||
import nu.marginalia.index.query.IndexQueryBuilder;
|
||||
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||
import nu.marginalia.index.results.model.ids.TermMetadataList;
|
||||
import nu.marginalia.index.model.SearchContext;
|
||||
import nu.marginalia.index.model.TermMetadataList;
|
||||
import nu.marginalia.index.reverse.FullReverseIndexReader;
|
||||
import nu.marginalia.index.reverse.IndexLanguageContext;
|
||||
import nu.marginalia.index.reverse.PrioReverseIndexReader;
|
||||
import nu.marginalia.index.reverse.query.IndexQuery;
|
||||
import nu.marginalia.index.reverse.query.IndexSearchBudget;
|
||||
import nu.marginalia.index.reverse.query.filter.QueryFilterStepIf;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
import org.slf4j.Logger;
|
||||
@@ -28,6 +30,7 @@ import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
/** A reader for the combined forward and reverse indexes.
|
||||
@@ -51,25 +54,33 @@ public class CombinedIndexReader {
|
||||
this.reverseIndexPriorityReader = reverseIndexPriorityReader;
|
||||
}
|
||||
|
||||
public IndexQueryBuilderImpl newQueryBuilder(IndexQuery query) {
|
||||
return new IndexQueryBuilderImpl(reverseIndexFullReader, query);
|
||||
public IndexLanguageContext createLanguageContext(String languageIsoCode) {
|
||||
return new IndexLanguageContext(languageIsoCode,
|
||||
reverseIndexFullReader.getWordLexicon(languageIsoCode),
|
||||
reverseIndexPriorityReader.getWordLexicon(languageIsoCode)
|
||||
);
|
||||
}
|
||||
|
||||
public QueryFilterStepIf hasWordFull(long termId) {
|
||||
return reverseIndexFullReader.also(termId);
|
||||
public IndexQueryBuilder newQueryBuilder(IndexLanguageContext context, IndexQuery query) {
|
||||
return new IndexQueryBuilder(reverseIndexFullReader, context, query);
|
||||
}
|
||||
|
||||
public QueryFilterStepIf hasWordFull(IndexLanguageContext languageContext, long termId, IndexSearchBudget budget) {
|
||||
return reverseIndexFullReader.also(languageContext, termId, budget);
|
||||
}
|
||||
|
||||
/** Creates a query builder for terms in the priority index */
|
||||
public IndexQueryBuilder findPriorityWord(long wordId) {
|
||||
return newQueryBuilder(new IndexQuery(reverseIndexPriorityReader.documents(wordId)))
|
||||
.withSourceTerms(wordId);
|
||||
public IndexQueryBuilder findPriorityWord(IndexLanguageContext languageContext, long wordId) {
|
||||
IndexQuery query = new IndexQuery(reverseIndexPriorityReader.documents(languageContext, wordId), true);
|
||||
|
||||
return newQueryBuilder(languageContext, query).withSourceTerms(wordId);
|
||||
}
|
||||
|
||||
/** Creates a query builder for terms in the full index */
|
||||
public IndexQueryBuilder findFullWord(long wordId) {
|
||||
return newQueryBuilder(
|
||||
new IndexQuery(reverseIndexFullReader.documents(wordId)))
|
||||
.withSourceTerms(wordId);
|
||||
public IndexQueryBuilder findFullWord(IndexLanguageContext languageContext, long wordId) {
|
||||
IndexQuery query = new IndexQuery(reverseIndexFullReader.documents(languageContext, wordId), false);
|
||||
|
||||
return newQueryBuilder(languageContext, query).withSourceTerms(wordId);
|
||||
}
|
||||
|
||||
/** Creates a parameter matching filter step for the provided parameters */
|
||||
@@ -78,21 +89,32 @@ public class CombinedIndexReader {
|
||||
}
|
||||
|
||||
/** Returns the number of occurrences of the word in the full index */
|
||||
public int numHits(long word) {
|
||||
return reverseIndexFullReader.numDocuments(word);
|
||||
public int numHits(IndexLanguageContext languageContext, long word) {
|
||||
return reverseIndexFullReader.numDocuments(languageContext, word);
|
||||
}
|
||||
|
||||
public List<IndexQuery> createQueries(SearchTerms terms, QueryParams params) {
|
||||
/** Reset caches and buffers */
|
||||
public void reset() {
|
||||
reverseIndexFullReader.reset();
|
||||
}
|
||||
|
||||
public List<IndexQuery> createQueries(SearchContext context) {
|
||||
|
||||
if (!isLoaded()) {
|
||||
logger.warn("Index reader not ready");
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
List<IndexQueryBuilder> queryHeads = new ArrayList<>(10);
|
||||
final IndexLanguageContext languageContext = context.languageContext;
|
||||
final long[] termPriority = context.sortedDistinctIncludes((a,b) -> {
|
||||
return Long.compare(
|
||||
numHits(languageContext, a),
|
||||
numHits(languageContext, b)
|
||||
);
|
||||
});
|
||||
|
||||
final long[] termPriority = terms.sortedDistinctIncludes(this::compareKeywords);
|
||||
List<LongSet> paths = CompiledQueryAggregates.queriesAggregate(terms.compiledQuery());
|
||||
List<IndexQueryBuilder> queryHeads = new ArrayList<>(10);
|
||||
List<LongSet> paths = CompiledQueryAggregates.queriesAggregate(context.compiledQueryIds);
|
||||
|
||||
// Remove any paths that do not contain all prioritized terms, as this means
|
||||
// the term is missing from the index and can never be found
|
||||
@@ -102,37 +124,27 @@ public class CombinedIndexReader {
|
||||
LongList elements = new LongArrayList(path);
|
||||
|
||||
elements.sort((a, b) -> {
|
||||
for (int i = 0; i < termPriority.length; i++) {
|
||||
if (termPriority[i] == a)
|
||||
for (long l : termPriority) {
|
||||
if (l == a)
|
||||
return -1;
|
||||
if (termPriority[i] == b)
|
||||
if (l == b)
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
});
|
||||
|
||||
if (!SearchTerms.stopWords.contains(elements.getLong(0))) {
|
||||
var head = findFullWord(elements.getLong(0));
|
||||
var head = findFullWord(languageContext, elements.getLong(0));
|
||||
|
||||
for (int i = 1; i < elements.size(); i++) {
|
||||
long termId = elements.getLong(i);
|
||||
|
||||
// if a stop word is present in the query, skip the step of requiring it to be in the document,
|
||||
// we'll assume it's there and save IO
|
||||
if (SearchTerms.stopWords.contains(termId)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
head.addInclusionFilter(hasWordFull(termId));
|
||||
}
|
||||
queryHeads.add(head);
|
||||
for (int i = 1; i < elements.size(); i++) {
|
||||
head.addInclusionFilter(hasWordFull(languageContext, elements.getLong(i), context.budget));
|
||||
}
|
||||
queryHeads.add(head);
|
||||
|
||||
// If there are few paths, we can afford to check the priority index as well
|
||||
if (paths.size() < 4) {
|
||||
var prioHead = findPriorityWord(elements.getLong(0));
|
||||
var prioHead = findPriorityWord(languageContext, elements.getLong(0));
|
||||
for (int i = 1; i < elements.size(); i++) {
|
||||
prioHead.addInclusionFilter(hasWordFull(elements.getLong(i)));
|
||||
prioHead.addInclusionFilter(hasWordFull(languageContext, elements.getLong(i), context.budget));
|
||||
}
|
||||
queryHeads.add(prioHead);
|
||||
}
|
||||
@@ -142,17 +154,17 @@ public class CombinedIndexReader {
|
||||
for (var query : queryHeads) {
|
||||
|
||||
// Advice terms are a special case, mandatory but not ranked, and exempt from re-writing
|
||||
for (long term : terms.advice()) {
|
||||
query = query.also(term);
|
||||
for (long term : context.termIdsAdvice) {
|
||||
query = query.also(term, context.budget);
|
||||
}
|
||||
|
||||
for (long term : terms.excludes()) {
|
||||
query = query.not(term);
|
||||
for (long term : context.termIdsExcludes) {
|
||||
query = query.not(term, context.budget);
|
||||
}
|
||||
|
||||
// Run these filter steps last, as they'll worst-case cause as many page faults as there are
|
||||
// items in the buffer
|
||||
query.addInclusionFilter(filterForParams(params));
|
||||
query.addInclusionFilter(filterForParams(context.queryParams));
|
||||
}
|
||||
|
||||
return queryHeads
|
||||
@@ -166,23 +178,20 @@ public class CombinedIndexReader {
|
||||
return permittedTerms::containsAll;
|
||||
}
|
||||
|
||||
private int compareKeywords(long a, long b) {
|
||||
return Long.compare(
|
||||
numHits(a),
|
||||
numHits(b)
|
||||
);
|
||||
}
|
||||
/** Returns the number of occurrences of the word in the priority index */
|
||||
public int numHitsPrio(long word) {
|
||||
return reverseIndexPriorityReader.numDocuments(word);
|
||||
public int numHitsPrio(IndexLanguageContext languageContext, long word) {
|
||||
return reverseIndexPriorityReader.numDocuments(languageContext, word);
|
||||
}
|
||||
|
||||
/** Retrieves the term metadata for the specified word for the provided documents */
|
||||
public TermMetadataList getTermMetadata(Arena arena,
|
||||
long wordId,
|
||||
CombinedDocIdList docIds)
|
||||
public TermMetadataList[] getTermMetadata(Arena arena,
|
||||
IndexLanguageContext languageContext,
|
||||
IndexSearchBudget budget,
|
||||
long[] wordIds,
|
||||
CombinedDocIdList docIds)
|
||||
throws TimeoutException
|
||||
{
|
||||
return new TermMetadataList(reverseIndexFullReader.getTermData(arena, wordId, docIds.array()));
|
||||
return reverseIndexFullReader.getTermData(arena, languageContext, budget, wordIds, docIds);
|
||||
}
|
||||
|
||||
/** Retrieves the document metadata for the specified document */
|
||||
@@ -206,13 +215,8 @@ public class CombinedIndexReader {
|
||||
}
|
||||
|
||||
/** Retrieves the document spans for the specified documents */
|
||||
public DocumentSpans[] getDocumentSpans(Arena arena, CombinedDocIdList docIds) {
|
||||
long[] decodedIDs = docIds.array();
|
||||
for (int i = 0; i < decodedIDs.length; i++) {
|
||||
decodedIDs[i] = UrlIdCodec.removeRank(decodedIDs[i]);
|
||||
}
|
||||
|
||||
return forwardIndexReader.getDocumentSpans(arena, decodedIDs);
|
||||
public DocumentSpans[] getDocumentSpans(Arena arena, IndexSearchBudget budget, CombinedDocIdList docIds) throws TimeoutException {
|
||||
return forwardIndexReader.getDocumentSpans(arena, budget, docIds);
|
||||
}
|
||||
|
||||
/** Close the indexes (this is not done immediately)
|
||||
@@ -263,6 +267,23 @@ class ParamMatchingQueryFilter implements QueryFilterStepIf {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void apply(LongQueryBuffer buffer) {
|
||||
if (!imposesMetaConstraint && !params.searchSet().imposesConstraint()) {
|
||||
return;
|
||||
}
|
||||
|
||||
while (buffer.hasMore()) {
|
||||
if (test(buffer.currentValue())) {
|
||||
buffer.retainAndAdvance();
|
||||
}
|
||||
else {
|
||||
buffer.rejectAndAdvance();
|
||||
}
|
||||
}
|
||||
|
||||
buffer.finalizeFiltering();
|
||||
}
|
||||
|
||||
public boolean test(long combinedId) {
|
||||
long docId = UrlIdCodec.removeRank(combinedId);
|
||||
int domainId = UrlIdCodec.getDomainId(docId);
|
||||
@@ -348,4 +369,5 @@ class ParamMatchingQueryFilter implements QueryFilterStepIf {
|
||||
public String describe() {
|
||||
return getClass().getSimpleName();
|
||||
}
|
||||
|
||||
}
|
@@ -3,16 +3,18 @@ package nu.marginalia.index;
import com.google.inject.Guice;
import com.google.inject.Inject;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.config.IndexFileName;
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.reverse.construction.full.FullIndexConstructor;
import nu.marginalia.index.reverse.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.searchset.DomainRankings;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.mq.MessageQueueFactory;
import nu.marginalia.mqapi.ProcessInboxNames;
import nu.marginalia.mqapi.index.CreateIndexRequest;
import nu.marginalia.mqapi.index.IndexName;
import nu.marginalia.process.ProcessConfiguration;
import nu.marginalia.process.ProcessConfigurationModule;
import nu.marginalia.process.ProcessMainClass;

@@ -25,11 +27,9 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import static nu.marginalia.mqapi.ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX;

public class IndexConstructorMain extends ProcessMainClass {
private final FileStorageService fileStorageService;
private final ProcessHeartbeatImpl heartbeat;

@@ -37,7 +37,7 @@ public class IndexConstructorMain extends ProcessMainClass {
private static final Logger logger = LoggerFactory.getLogger(IndexConstructorMain.class);

public static void main(String[] args) throws Exception {
static void main(String[] args) throws Exception {
Instructions<CreateIndexRequest> instructions = null;
try {
new org.mariadb.jdbc.Driver();

@@ -74,20 +74,20 @@ public class IndexConstructorMain extends ProcessMainClass {
ProcessConfiguration processConfiguration,
DomainRankings domainRankings) {

super(messageQueueFactory, processConfiguration, GsonFactory.get(), INDEX_CONSTRUCTOR_INBOX);
super(messageQueueFactory, processConfiguration, GsonFactory.get(), ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX);

this.fileStorageService = fileStorageService;
this.heartbeat = heartbeat;
this.domainRankings = domainRankings;
}

private void run(CreateIndexRequest instructions) throws SQLException, IOException {
private void run(CreateIndexRequest instructions) throws IOException {
heartbeat.start();

switch (instructions.indexName()) {
case FORWARD -> createForwardIndex();
case REVERSE_FULL -> createFullReverseIndex();
case REVERSE_PRIO -> createPrioReverseIndex();
case IndexName.FORWARD -> createForwardIndex();
case IndexName.REVERSE_FULL -> createFullReverseIndex();
case IndexName.REVERSE_PRIO -> createPrioReverseIndex();
}

heartbeat.shutDown();

@@ -95,50 +95,74 @@ public class IndexConstructorMain extends ProcessMainClass {
private void createFullReverseIndex() throws IOException {

Path outputFileDocs = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.NEXT);
Path outputFileWords = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.NEXT);
Path outputFilePositions = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.NEXT);
Path outputFileDocs = findNextFile(new IndexFileName.FullDocs());
Path outputFilePositions = findNextFile(new IndexFileName.FullPositions());

Files.deleteIfExists(outputFileDocs);
Files.deleteIfExists(outputFilePositions);

Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
Path tmpDir = workDir.resolve("tmp");

if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);

var constructor = new FullIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,
this::addRankToIdEncoding,
tmpDir);
Set<String> languageIsoCodes = IndexJournal.findJournal(workDir)
.map(IndexJournal::languages)
.orElseGet(Set::of);

constructor.createReverseIndex(heartbeat, "createReverseIndexFull", workDir);
for (String languageIsoCode : languageIsoCodes) {
Path outputFileWords = findNextFile(new IndexFileName.FullWords(languageIsoCode));

FullIndexConstructor constructor = new FullIndexConstructor(
languageIsoCode,
outputFileDocs,
outputFileWords,
outputFilePositions,
this::addRankToIdEncoding,
tmpDir);

String processName = "createReverseIndexFull[%s]".formatted(languageIsoCode);

constructor.createReverseIndex(heartbeat, processName, workDir);
}
}

private void createPrioReverseIndex() throws IOException {

Path outputFileDocs = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.NEXT);
Path outputFileWords = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.NEXT);
Path outputFileDocs = findNextFile(new IndexFileName.PrioDocs());
Files.deleteIfExists(outputFileDocs);

Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
Path tmpDir = workDir.resolve("tmp");

var constructor = new PrioIndexConstructor(
outputFileDocs,
outputFileWords,
this::addRankToIdEncoding,
tmpDir);
Set<String> languageIsoCodes = IndexJournal.findJournal(workDir)
.map(IndexJournal::languages)
.orElseGet(Set::of);

constructor.createReverseIndex(heartbeat, "createReverseIndexPrio", workDir);
for (String languageIsoCode : languageIsoCodes) {
Path outputFileWords = findNextFile(new IndexFileName.PrioWords(languageIsoCode));
Files.deleteIfExists(outputFileWords);

PrioIndexConstructor constructor = new PrioIndexConstructor(
languageIsoCode,
outputFileDocs,
outputFileWords,
this::addRankToIdEncoding,
tmpDir);

String processName = "createReverseIndexPrio[%s]".formatted(languageIsoCode);

constructor.createReverseIndex(heartbeat, processName, workDir);
}
}

private void createForwardIndex() throws IOException {

Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);

Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT);
Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT);
Path outputFileSpansData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.NEXT);
Path outputFileDocsId = findNextFile(new IndexFileName.ForwardDocIds());
Path outputFileDocsData = findNextFile(new IndexFileName.ForwardDocIds());
Path outputFileSpansData = findNextFile(new IndexFileName.ForwardSpansData());

ForwardIndexConverter converter = new ForwardIndexConverter(heartbeat,
outputFileDocsId,

@@ -151,6 +175,10 @@ public class IndexConstructorMain extends ProcessMainClass {
converter.convert();
}

private Path findNextFile(IndexFileName fileName) {
return IndexFileName.resolve(IndexLocations.getCurrentIndex(fileStorageService), fileName, IndexFileName.Version.NEXT);
}

/** Append the domain's ranking to the high bits of a document ID
* to ensure they're sorted in order of rank within the index.
*/
@@ -4,7 +4,7 @@ import com.google.inject.AbstractModule;
import com.google.inject.Provides;
import com.google.inject.Singleton;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.searchset.DomainRankings;
import nu.marginalia.storage.FileStorageService;

public class IndexConstructorModule extends AbstractModule {

@@ -3,27 +3,34 @@ package nu.marginalia.index;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.config.IndexFileName;
import nu.marginalia.index.forward.ForwardIndexReader;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.positions.PositionsFileReader;
import nu.marginalia.index.reverse.FullReverseIndexReader;
import nu.marginalia.index.reverse.PrioReverseIndexReader;
import nu.marginalia.index.reverse.WordLexicon;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.storage.FileStorageService;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.List;

@Singleton
public class IndexFactory {
private final FileStorageService fileStorageService;
private final Path liveStorage;
private final LanguageConfiguration languageConfiguration;

@Inject
public IndexFactory(FileStorageService fileStorageService) {
public IndexFactory(FileStorageService fileStorageService, LanguageConfiguration languageConfiguration) {

this.fileStorageService = fileStorageService;
this.liveStorage = IndexLocations.getCurrentIndex(fileStorageService);
this.languageConfiguration = languageConfiguration;
}

public CombinedIndexReader getCombinedIndexReader() throws IOException {

@@ -39,47 +46,78 @@ public class IndexFactory {
}

public FullReverseIndexReader getReverseIndexReader() throws IOException {

Path docsFile = getCurrentPath(new IndexFileName.FullDocs());
Path positionsFile = getCurrentPath(new IndexFileName.FullPositions());

List<WordLexicon> wordLexicons = new ArrayList<>();

for (LanguageDefinition languageDefinition : languageConfiguration.languages()) {
String languageIsoCode = languageDefinition.isoCode();
Path wordsFile = getCurrentPath(new IndexFileName.FullWords(languageIsoCode));
if (Files.exists(wordsFile)) {
wordLexicons.add(new WordLexicon(languageIsoCode, wordsFile));
}
}

return new FullReverseIndexReader("full",
ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.CURRENT),
ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.CURRENT),
new PositionsFileReader(ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.CURRENT))
wordLexicons,
docsFile,
positionsFile
);
}

public PrioReverseIndexReader getReverseIndexPrioReader() throws IOException {
return new PrioReverseIndexReader("prio",
ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.CURRENT),
ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.CURRENT)
);

List<WordLexicon> wordLexicons = new ArrayList<>();

for (LanguageDefinition languageDefinition : languageConfiguration.languages()) {
String languageIsoCode = languageDefinition.isoCode();
Path wordsFile = getCurrentPath(new IndexFileName.PrioWords(languageIsoCode));
if (Files.exists(wordsFile)) {
wordLexicons.add(new WordLexicon(languageIsoCode, wordsFile));
}
}

Path docsFile = getCurrentPath(new IndexFileName.PrioDocs());

return new PrioReverseIndexReader("prio", wordLexicons, docsFile);
}

public ForwardIndexReader getForwardIndexReader() throws IOException {
return new ForwardIndexReader(
ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.CURRENT),
ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.CURRENT),
ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.CURRENT)
);
Path docIdsFile = getCurrentPath(new IndexFileName.ForwardDocIds());
Path docDataFile = getCurrentPath(new IndexFileName.ForwardDocData());
Path spansFile = getCurrentPath(new IndexFileName.ForwardSpansData());

return new ForwardIndexReader(docIdsFile, docDataFile, spansFile);
}

private Path getCurrentPath(IndexFileName fileName) {
return IndexFileName.resolve(liveStorage, fileName, IndexFileName.Version.CURRENT);
}

/** Switches the current index to the next index */
public void switchFiles() throws IOException {

for (var file : ReverseIndexFullFileNames.FileIdentifier.values()) {
for (var file : IndexFileName.forwardIndexFiles()) {
switchFile(
ReverseIndexFullFileNames.resolve(liveStorage, file, ReverseIndexFullFileNames.FileVersion.NEXT),
ReverseIndexFullFileNames.resolve(liveStorage, file, ReverseIndexFullFileNames.FileVersion.CURRENT)
IndexFileName.resolve(liveStorage, file, IndexFileName.Version.NEXT),
IndexFileName.resolve(liveStorage, file, IndexFileName.Version.CURRENT)
);
}
for (var file : ReverseIndexPrioFileNames.FileIdentifier.values()) {

for (IndexFileName file : IndexFileName.revPrioIndexFiles(languageConfiguration)) {
switchFile(
ReverseIndexPrioFileNames.resolve(liveStorage, file, ReverseIndexPrioFileNames.FileVersion.NEXT),
ReverseIndexPrioFileNames.resolve(liveStorage, file, ReverseIndexPrioFileNames.FileVersion.CURRENT)
IndexFileName.resolve(liveStorage, file, IndexFileName.Version.NEXT),
IndexFileName.resolve(liveStorage, file, IndexFileName.Version.CURRENT)
);
}
for (var file : ForwardIndexFileNames.FileIdentifier.values()) {

for (IndexFileName file : IndexFileName.revFullIndexFiles(languageConfiguration)) {
switchFile(
ForwardIndexFileNames.resolve(liveStorage, file, ForwardIndexFileNames.FileVersion.NEXT),
ForwardIndexFileNames.resolve(liveStorage, file, ForwardIndexFileNames.FileVersion.CURRENT)
IndexFileName.resolve(liveStorage, file, IndexFileName.Version.NEXT),
IndexFileName.resolve(liveStorage, file, IndexFileName.Version.CURRENT)
);
}
}
@@ -5,18 +5,19 @@ import com.google.inject.Singleton;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import io.prometheus.client.Counter;
import io.prometheus.client.Gauge;
import io.prometheus.client.Histogram;
import nu.marginalia.api.searchquery.IndexApiGrpc;
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
import nu.marginalia.api.searchquery.RpcIndexQuery;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.model.SearchContext;
import nu.marginalia.index.results.IndexResultRankingService;
import nu.marginalia.index.searchset.SearchSet;
import nu.marginalia.index.searchset.SearchSetsService;
import nu.marginalia.index.searchset.SmallSearchSet;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.keywords.KeywordHasher;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.service.server.DiscoverableService;
import org.slf4j.Logger;

@@ -24,7 +25,9 @@ import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

@Singleton
public class IndexGrpcService

@@ -33,6 +36,7 @@ public class IndexGrpcService
{

private final Logger logger = LoggerFactory.getLogger(getClass());
private final Map<String, KeywordHasher> keywordHasherByLangIso;

// This marker is used to mark sensitive log messages that are related to queries
// so that they can be filtered out in the production logging configuration

@@ -43,11 +47,6 @@ public class IndexGrpcService
.help("Query timeout counter")
.labelNames("node", "api")
.register();
private static final Gauge wmsa_query_cost = Gauge.build()
.name("wmsa_index_query_cost")
.help("Computational cost of query")
.labelNames("node", "api")
.register();
private static final Histogram wmsa_query_time = Histogram.build()
.name("wmsa_index_query_time")
.linearBuckets(0.05, 0.05, 15)

@@ -55,60 +54,60 @@ public class IndexGrpcService
.help("Index-side query time")
.register();

private static final Gauge wmsa_index_query_exec_stall_time = Gauge.build()
.name("wmsa_index_query_exec_stall_time")
.help("Execution stall time")
.labelNames("node")
.register();

private static final Gauge wmsa_index_query_exec_block_time = Gauge.build()
.name("wmsa_index_query_exec_block_time")
.help("Execution stall time")
.labelNames("node")
.register();

private final StatefulIndex statefulIndex;
private final SearchSetsService searchSetsService;

private final IndexResultRankingService rankingService;

private final String nodeName;

private final int nodeId;

@Inject
public IndexGrpcService(ServiceConfiguration serviceConfiguration,
LanguageConfiguration languageConfiguration,
StatefulIndex statefulIndex,
SearchSetsService searchSetsService,
IndexResultRankingService rankingService)
{
var nodeId = serviceConfiguration.node();
this.nodeId = serviceConfiguration.node();
this.nodeName = Integer.toString(nodeId);
this.statefulIndex = statefulIndex;
this.searchSetsService = searchSetsService;
this.rankingService = rankingService;
this.keywordHasherByLangIso = new HashMap<>();

for (LanguageDefinition definition : languageConfiguration.languages()) {
keywordHasherByLangIso.put(definition.isoCode(), definition.keywordHasher());
}
}

// GRPC endpoint

public void query(RpcIndexQuery request,
StreamObserver<RpcDecoratedResultItem> responseObserver) {

try {
var params = new SearchParameters(request, getSearchSet(request));

long endTime = System.currentTimeMillis() + request.getQueryLimits().getTimeoutMs();
KeywordHasher hasher = findHasher(request);

List<RpcDecoratedResultItem> results = wmsa_query_time
.labels(nodeName, "GRPC")
.time(() -> {
// Perform the search
try {

if (!statefulIndex.isLoaded()) {
// Short-circuit if the index is not loaded, as we trivially know that there can be no results
return List.of();
}

return new IndexQueryExecution(params, rankingService, statefulIndex.get()).run();
CombinedIndexReader indexReader = statefulIndex.get();

SearchContext rankingContext =
SearchContext.create(indexReader, hasher, request, getSearchSet(request));

IndexQueryExecution queryExecution =
new IndexQueryExecution(indexReader, rankingService, rankingContext, nodeId);

return queryExecution.run();
}
catch (Exception ex) {
logger.error("Error in handling request", ex);

@@ -116,11 +115,6 @@ public class IndexGrpcService
}
});

// Prometheus bookkeeping
wmsa_query_cost
.labels(nodeName, "GRPC")
.set(params.getDataCost());

if (System.currentTimeMillis() >= endTime) {
wmsa_query_timeouts
.labels(nodeName, "GRPC")

@@ -140,6 +134,21 @@ public class IndexGrpcService
}
}

/** Keywords are translated to a numeric format via a 64 bit hash algorithm,
* which varies depends on the language.
*/
private KeywordHasher findHasher(RpcIndexQuery request) {
KeywordHasher hasher = keywordHasherByLangIso.get(request.getLangIsoCode());
if (hasher != null)
return hasher;

hasher = keywordHasherByLangIso.get("en");
if (hasher != null)
return hasher;

throw new IllegalStateException("Could not find fallback keyword hasher for iso code 'en'");
}

// exists for test access
public List<RpcDecoratedResultItem> justQuery(SearchSpecification specsSet) {

@@ -149,7 +158,12 @@ public class IndexGrpcService
return List.of();
}

return new IndexQueryExecution(new SearchParameters(specsSet, getSearchSet(specsSet)), rankingService, statefulIndex.get()).run();
CombinedIndexReader currentIndex = statefulIndex.get();

SearchContext context = SearchContext.create(currentIndex,
keywordHasherByLangIso.get("en"), specsSet, getSearchSet(specsSet));

return new IndexQueryExecution(currentIndex, rankingService, context, 1).run();
}
catch (Exception ex) {
logger.error("Error in handling request", ex);
@@ -1,14 +1,14 @@
package nu.marginalia.index.index;
package nu.marginalia.index;

import java.util.List;
import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.index.FullReverseIndexReader;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.filter.QueryFilterAnyOf;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import nu.marginalia.index.reverse.FullReverseIndexReader;
import nu.marginalia.index.reverse.IndexLanguageContext;
import nu.marginalia.index.reverse.query.IndexQuery;
import nu.marginalia.index.reverse.query.IndexSearchBudget;
import nu.marginalia.index.reverse.query.filter.QueryFilterStepIf;

public class IndexQueryBuilderImpl implements IndexQueryBuilder {
public class IndexQueryBuilder {
private final IndexLanguageContext context;
private final IndexQuery query;
private final FullReverseIndexReader reverseIndexFullReader;

@@ -20,8 +20,9 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder {
* */
private final TLongHashSet alreadyConsideredTerms = new TLongHashSet();

IndexQueryBuilderImpl(FullReverseIndexReader reverseIndexFullReader, IndexQuery query)
IndexQueryBuilder(FullReverseIndexReader reverseIndexFullReader, IndexLanguageContext context, IndexQuery query)
{
this.context = context;
this.query = query;
this.reverseIndexFullReader = reverseIndexFullReader;
}

@@ -32,18 +33,18 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder {
return this;
}

public IndexQueryBuilder also(long termId) {
public IndexQueryBuilder also(long termId, IndexSearchBudget budget) {

if (alreadyConsideredTerms.add(termId)) {
query.addInclusionFilter(reverseIndexFullReader.also(termId));
query.addInclusionFilter(reverseIndexFullReader.also(context, termId, budget));
}

return this;
}

public IndexQueryBuilder not(long termId) {
public IndexQueryBuilder not(long termId, IndexSearchBudget budget) {

query.addInclusionFilter(reverseIndexFullReader.not(termId));
query.addInclusionFilter(reverseIndexFullReader.not(context, termId, budget));

return this;
}

@@ -55,20 +56,6 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder {
return this;
}

public IndexQueryBuilder addInclusionFilterAny(List<QueryFilterStepIf> filterSteps) {
if (filterSteps.isEmpty())
return this;

if (filterSteps.size() == 1) {
query.addInclusionFilter(filterSteps.getFirst());
}
else {
query.addInclusionFilter(new QueryFilterAnyOf(filterSteps));
}

return this;
}

public IndexQuery build() {
return query;
}
@@ -1,132 +1,247 @@
package nu.marginalia.index;

import io.prometheus.client.Gauge;
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.model.ResultRankingContext;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.model.SearchTerms;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.model.CombinedDocIdList;
import nu.marginalia.index.model.SearchContext;
import nu.marginalia.index.results.IndexResultRankingService;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.reverse.query.IndexQuery;
import nu.marginalia.index.reverse.query.IndexSearchBudget;
import nu.marginalia.skiplist.SkipListConstants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.*;

/** Performs an index query */
public class IndexQueryExecution {

private static final int indexValuationThreads = Integer.getInteger("index.valuationThreads", 16);
private static final int indexValuationThreads = Integer.getInteger("index.valuationThreads", 8);
private static final int indexPreparationThreads = Integer.getInteger("index.preparationThreads", 2);

private static final ForkJoinPool lookupPool = new ForkJoinPool(indexValuationThreads);
private static final ForkJoinPool evaluationPool = new ForkJoinPool(indexValuationThreads);
// Since most NVMe drives have a maximum read size of 128 KB, and most small reads are 512B
// this should probably be 128*1024 / 512 = 256 to reduce queue depth and optimize tail latency
private static final int evaluationBatchSize = 256;

// This should probably be SkipListConstants.BLOCK_SIZE / 16 in order to reduce the number of unnecessary read
// operations per lookup and again optimize tail latency
private static final int lookupBatchSize = SkipListConstants.BLOCK_SIZE / 16;

private static final ExecutorService threadPool =
new ThreadPoolExecutor(indexValuationThreads, 256,
60L, TimeUnit.SECONDS, new SynchronousQueue<>());

private static final Logger log = LoggerFactory.getLogger(IndexQueryExecution.class);

private final String nodeName;
private final IndexResultRankingService rankingService;

private final ResultRankingContext rankingContext;
private final SearchContext rankingContext;
private final List<IndexQuery> queries;
private final IndexSearchBudget budget;
private final ResultPriorityQueue resultHeap;
private final CountDownLatch executionCountdown;

private final CountDownLatch lookupCountdown;
private final CountDownLatch preparationCountdown;
private final CountDownLatch rankingCountdown;

private final ArrayBlockingQueue<CombinedDocIdList> fullPreparationQueue = new ArrayBlockingQueue<>(1);
private final ArrayBlockingQueue<CombinedDocIdList> priorityPreparationQueue = new ArrayBlockingQueue<>(1);
private final ArrayBlockingQueue<IndexResultRankingService.RankingData> fullEvaluationQueue = new ArrayBlockingQueue<>(32);
private final ArrayBlockingQueue<IndexResultRankingService.RankingData> priorityEvaluationQueue = new ArrayBlockingQueue<>(32);

private final int limitTotal;
private final int limitByDomain;

private int evaluationJobCounter;
private static final Gauge metric_index_lookup_time_s = Gauge.build()
.labelNames("node")
.name("index_exec_lookup_time_s")
.help("Time in query spent on lookups")
.register();

public IndexQueryExecution(SearchParameters params,
private static final Gauge metric_index_prep_time_s = Gauge.build()
.labelNames("node")
.name("index_exec_prep_time_s")
.help("Time in query spent retrieving positions and spans")
.register();

private static final Gauge metric_index_rank_time_s = Gauge.build()
.labelNames("node")
.name("index_exec_ranking_time_s")
.help("Time in query spent on ranking")
.register();

private static final Gauge metric_index_documents_ranked = Gauge.build()
.labelNames("node")
.name("index_exec_documents_ranked")
.help("Number of documents ranked")
.register();

public IndexQueryExecution(CombinedIndexReader currentIndex,
IndexResultRankingService rankingService,
CombinedIndexReader currentIndex) {
SearchContext rankingContext,
int serviceNode) {
this.nodeName = Integer.toString(serviceNode);
this.rankingService = rankingService;
this.rankingContext = rankingContext;

resultHeap = new ResultPriorityQueue(params.fetchSize);
resultHeap = new ResultPriorityQueue(rankingContext.fetchSize);

budget = params.budget;
limitByDomain = params.limitByDomain;
limitTotal = params.limitTotal;
budget = rankingContext.budget;
limitByDomain = rankingContext.limitByDomain;
limitTotal = rankingContext.limitTotal;

rankingContext = ResultRankingContext.create(currentIndex, params);
queries = currentIndex.createQueries(new SearchTerms(params.query, params.compiledQueryIds), params.queryParams);
executionCountdown = new CountDownLatch(queries.size());
queries = currentIndex.createQueries(rankingContext);

evaluationJobCounter = 0;
lookupCountdown = new CountDownLatch(queries.size());
preparationCountdown = new CountDownLatch(indexPreparationThreads * 2);
rankingCountdown = new CountDownLatch(indexValuationThreads * 2);
}

public List<RpcDecoratedResultItem> run() throws InterruptedException, SQLException {
// Spawn lookup tasks for each query
for (IndexQuery query : queries) {
lookupPool.execute(() -> lookup(query));
threadPool.submit(() -> lookup(query));
}

// Await lookup task termination (this guarantees we're no longer creating new evaluation tasks)
executionCountdown.await();

// Await evaluation task termination
synchronized (IndexQueryExecution.this) {
while (evaluationJobCounter > 0) {
IndexQueryExecution.this.wait(budget.timeLeft());
}
for (int i = 0; i < indexPreparationThreads; i++) {
threadPool.submit(() -> prepare(priorityPreparationQueue, priorityEvaluationQueue));
threadPool.submit(() -> prepare(fullPreparationQueue, fullEvaluationQueue));
}

// Spawn lookup tasks for each query
for (int i = 0; i < indexValuationThreads; i++) {
threadPool.submit(() -> evaluate(priorityEvaluationQueue));
threadPool.submit(() -> evaluate(fullEvaluationQueue));
}

// Await lookup task termination
lookupCountdown.await();
preparationCountdown.await();
rankingCountdown.await();

// Deallocate any leftover ranking data buffers
for (var data : priorityEvaluationQueue) {
data.close();
}
for (var data : fullEvaluationQueue) {
data.close();
}

metric_index_documents_ranked
.labels(nodeName)
.inc(1000. * resultHeap.getItemsProcessed() / budget.getLimitTime());

// Final result selection
return rankingService.selectBestResults(limitByDomain, limitTotal, rankingContext, resultHeap);
return rankingService.selectBestResults(limitByDomain, limitTotal, rankingContext, resultHeap.toList());
}

private void lookup(IndexQuery query) {
final LongQueryBuffer buffer = new LongQueryBuffer(1024);
private List<Future<?>> lookup(IndexQuery query) {
final LongQueryBuffer buffer = new LongQueryBuffer(lookupBatchSize);
List<Future<?>> evaluationJobs = new ArrayList<>();
try {
while (query.hasMore() && budget.hasTimeLeft()) {

buffer.reset();
buffer.zero();

long st = System.nanoTime();
query.getMoreResults(buffer);
long et = System.nanoTime();
metric_index_lookup_time_s
.labels(nodeName)
.inc((et - st)/1_000_000_000.);

if (buffer.isEmpty())
continue;

CombinedDocIdList docIds = new CombinedDocIdList(buffer);
var queue = query.isPrioritized() ? priorityPreparationQueue : fullPreparationQueue;

boolean stealWork = false;
synchronized (IndexQueryExecution.this) {
// Hold off on spawning new evaluation jobs if we have too many queued
// to avoid backpressure, instead steal work into the lookup thread
// in this scenario
if (buffer.end <= evaluationBatchSize) {
var docIds = new CombinedDocIdList(buffer);

if (evaluationJobCounter > indexValuationThreads * 8) {
stealWork = true;
}
else {
evaluationJobCounter++;
}
}

if (stealWork) {
resultHeap.addAll(rankingService.rankResults(rankingContext, docIds, false));
if (!queue.offer(docIds, Math.max(1, budget.timeLeft()), TimeUnit.MILLISECONDS))
break;
}
else {
// Spawn an evaluation task
evaluationPool.execute(() -> evaluate(docIds));
long[] bufferData = buffer.copyData();
for (int start = 0; start < bufferData.length; start+= evaluationBatchSize) {

long[] slice = Arrays.copyOfRange(bufferData, start,
Math.min(start + evaluationBatchSize, bufferData.length));

var docIds = new CombinedDocIdList(slice);

if (!queue.offer(docIds, Math.max(1, budget.timeLeft()), TimeUnit.MILLISECONDS))
break;

}
}
}
} catch (RuntimeException | InterruptedException ex) {
log.error("Exception in lookup thread", ex);
} finally {
buffer.dispose();
executionCountdown.countDown();
lookupCountdown.countDown();
}

return evaluationJobs;
}

private void prepare(ArrayBlockingQueue<CombinedDocIdList> inputQueue, ArrayBlockingQueue<IndexResultRankingService.RankingData> outputQueue) {
try {
while (budget.hasTimeLeft() && (lookupCountdown.getCount() > 0 || !inputQueue.isEmpty())) {
var docIds = inputQueue.poll(Math.clamp(budget.timeLeft(), 1, 5), TimeUnit.MILLISECONDS);
if (docIds == null) continue;

long st = System.nanoTime();
var preparedData = rankingService.prepareRankingData(rankingContext, docIds);
long et = System.nanoTime();
metric_index_prep_time_s
.labels(nodeName)
.inc((et - st)/1_000_000_000.);

if (!outputQueue.offer(preparedData, Math.max(1, budget.timeLeft()), TimeUnit.MILLISECONDS))
preparedData.close();
}
} catch (TimeoutException ex) {
// This is normal
} catch (Exception ex) {
if (!(ex.getCause() instanceof InterruptedException)) {
log.error("Exception in lookup thread", ex);
} // suppress logging for interrupted ex
} finally {
preparationCountdown.countDown();
}
}

private void evaluate(CombinedDocIdList docIds) {
private void evaluate(ArrayBlockingQueue<IndexResultRankingService.RankingData> queue) {
try {
if (!budget.hasTimeLeft())
return;
resultHeap.addAll(rankingService.rankResults(rankingContext, docIds, false));
} finally {
synchronized (IndexQueryExecution.this) {
if (--evaluationJobCounter == 0) {
IndexQueryExecution.this.notifyAll();
while (budget.hasTimeLeft() && (preparationCountdown.getCount() > 0 || !queue.isEmpty())) {
var rankingData = queue.poll(Math.clamp(budget.timeLeft(), 1, 5), TimeUnit.MILLISECONDS);
if (rankingData == null) continue;

try (rankingData) {
long st = System.nanoTime();
resultHeap.addAll(rankingService.rankResults(rankingContext, rankingData, false));
long et = System.nanoTime();

metric_index_rank_time_s
.labels(nodeName)
.inc((et - st)/1_000_000_000.);
}
}
} catch (Exception ex) {
if (!(ex.getCause() instanceof InterruptedException)) {
log.error("Exception in lookup thread", ex);
} // suppress logging for interrupted ex
} finally {
rankingCountdown.countDown();
}
}
@@ -5,9 +5,7 @@ import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import org.jetbrains.annotations.NotNull;

import java.util.Collection;
import java.util.Comparator;
import java.util.Iterator;
import java.util.*;

/** A priority queue for search results. This class is not thread-safe,
* in general, except for concurrent use of the addAll method.

@@ -27,7 +25,7 @@ public class ResultPriorityQueue implements Iterable<SearchResultItem> {
this.queue = MinMaxPriorityQueue.<SearchResultItem>orderedBy(Comparator.naturalOrder()).maximumSize(limit).create();
}

public Iterator<SearchResultItem> iterator() {
public @NotNull Iterator<SearchResultItem> iterator() {
return queue.iterator();
}

@@ -46,6 +44,10 @@ public class ResultPriorityQueue implements Iterable<SearchResultItem> {
return true;
}

public synchronized List<SearchResultItem> toList() {
return new ArrayList<>(queue);
}

public int size() {
return queue.size();
}
@@ -1,8 +1,7 @@
package nu.marginalia.index.index;
package nu.marginalia.index;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.index.IndexFactory;
import nu.marginalia.service.control.ServiceEventLog;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
@@ -1,4 +1,4 @@
package nu.marginalia.index.forward;
package nu.marginalia.index.config;

public class ForwardIndexParameters {
public static final int ENTRY_SIZE = 3;
@@ -0,0 +1,97 @@
package nu.marginalia.index.config;

import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.model.LanguageDefinition;

import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;

public sealed interface IndexFileName {
enum Version {
CURRENT, NEXT
}

record FullWords(String languageIsoCode) implements IndexFileName {}
record FullDocs() implements IndexFileName {}
record FullPositions() implements IndexFileName {}

record PrioWords(String languageIsoCode) implements IndexFileName {}
record PrioDocs() implements IndexFileName {}

record ForwardDocIds() implements IndexFileName { }
record ForwardDocData() implements IndexFileName { }
record ForwardSpansData() implements IndexFileName { }

static List<IndexFileName> revFullIndexFiles(LanguageConfiguration languageConfiguration) {
List<IndexFileName> ret = new ArrayList<>();

ret.add(new FullDocs());
ret.add(new FullPositions());

for (LanguageDefinition ld : languageConfiguration.languages()) {
ret.add(new FullWords(ld.isoCode()));
}

return ret;
}

static List<IndexFileName> revPrioIndexFiles(LanguageConfiguration languageConfiguration) {
List<IndexFileName> ret = new ArrayList<>();

ret.add(new PrioDocs());

for (LanguageDefinition ld : languageConfiguration.languages()) {
ret.add(new PrioWords(ld.isoCode()));
}

return ret;
}

static List<IndexFileName> forwardIndexFiles() {
return List.of(
new ForwardDocData(),
new ForwardDocIds(),
new ForwardSpansData()
);
}

static Path resolve(Path basePath, IndexFileName fileName, Version version) {
return switch (fileName) {
case FullWords(String isoCode) -> switch (version) {
case CURRENT -> basePath.resolve("rev-words-%s.dat".formatted(isoCode));
case NEXT -> basePath.resolve("rev-words-%s.dat.next".formatted(isoCode));
};
case FullDocs() -> switch (version) {
case CURRENT -> basePath.resolve("rev-docs.dat");
case NEXT -> basePath.resolve("rev-docs.dat.next");
};
case FullPositions() -> switch (version) {
case CURRENT -> basePath.resolve("rev-positions.dat");
case NEXT -> basePath.resolve("rev-positions.dat.next");
};
case PrioWords(String languageIsoCode) -> switch (version) {
case CURRENT -> basePath.resolve("rev-prio-words-%s.dat".formatted(languageIsoCode));
case NEXT -> basePath.resolve("rev-prio-words-%s.dat.next".formatted(languageIsoCode));
};
case PrioDocs() -> switch (version) {
case CURRENT -> basePath.resolve("rev-prio-docs.dat");
case NEXT -> basePath.resolve("rev-prio-docs.dat.next");
};
case ForwardDocIds() -> switch (version) {
case CURRENT -> basePath.resolve("fwd-doc-ids.dat");
case NEXT -> basePath.resolve("fwd-doc-ids.dat.next");
};
case ForwardDocData() -> switch (version) {
case CURRENT -> basePath.resolve("fwd-doc-data.dat");
case NEXT -> basePath.resolve("fwd-doc-data.dat.next");
};
case ForwardSpansData() -> switch (version) {
case CURRENT -> basePath.resolve("fwd-spans.dat");
case NEXT -> basePath.resolve("fwd-spans.dat.next");
};
};
}

}
@@ -0,0 +1,9 @@
package nu.marginalia.index.config;

import nu.marginalia.btree.model.BTreeBlockSize;
import nu.marginalia.btree.model.BTreeContext;

public class ReverseIndexParameters
{
public static final BTreeContext wordsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_512);
}
@@ -3,8 +3,11 @@ package nu.marginalia.index.forward;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.ffi.LinuxSystemCalls;
import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.forward.spans.IndexSpansReader;
import nu.marginalia.index.model.CombinedDocIdList;
import nu.marginalia.index.reverse.query.IndexSearchBudget;
import nu.marginalia.model.id.UrlIdCodec;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@@ -13,8 +16,9 @@ import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.TimeoutException;

import static nu.marginalia.index.forward.ForwardIndexParameters.*;
import static nu.marginalia.index.config.ForwardIndexParameters.*;

/** Reads the forward index.
* <p/>

@@ -65,6 +69,9 @@ public class ForwardIndexReader {
ids = loadIds(idsFile);
data = loadData(dataFile);

LinuxSystemCalls.madviseRandom(data.getMemorySegment());
LinuxSystemCalls.madviseRandom(ids.getMemorySegment());

spansReader = IndexSpansReader.open(spansFile);

Thread.ofPlatform().start(this::createIdsMap);

@@ -76,6 +83,7 @@ public class ForwardIndexReader {
idsMap.put(ids.get(i), i);
}
this.idsMap = idsMap;
logger.info("Forward index loaded into RAM");
}

private static LongArray loadIds(Path idsFile) throws IOException {

@@ -121,7 +129,7 @@ public class ForwardIndexReader {
return idsMap.getOrDefault(docId, -1);
}

long offset = ids.binarySearch(docId, 0, ids.size());
long offset = ids.binarySearch2(docId, 0, ids.size());

if (offset >= ids.size() || offset < 0 || ids.get(offset) != docId) {
if (getClass().desiredAssertionStatus()) {

@@ -133,26 +141,12 @@ public class ForwardIndexReader {
return (int) offset;
}

public DocumentSpans getDocumentSpans(Arena arena, long docId) {
long offset = idxForDoc(docId);
if (offset < 0) return new DocumentSpans();
public DocumentSpans[] getDocumentSpans(Arena arena, IndexSearchBudget budget, CombinedDocIdList combinedIds) throws TimeoutException {
long[] offsets = new long[combinedIds.size()];

long encodedOffset = data.get(ENTRY_SIZE * offset + SPANS_OFFSET);

try {
return spansReader.readSpans(arena, encodedOffset);
}
catch (IOException ex) {
logger.error("Failed to read spans for doc " + docId, ex);
return new DocumentSpans();
}
}

public DocumentSpans[] getDocumentSpans(Arena arena, long[] docIds) {
long[] offsets = new long[docIds.length];
for (int i = 0; i < docIds.length; i++) {
long offset = idxForDoc(docIds[i]);
for (int i = 0; i < offsets.length; i++) {
long docId = UrlIdCodec.removeRank(combinedIds.at(i));
long offset = idxForDoc(docId);
if (offset >= 0) {
offsets[i] = data.get(ENTRY_SIZE * offset + SPANS_OFFSET);
}

@@ -162,11 +156,11 @@ public class ForwardIndexReader {
}

try {
return spansReader.readSpans(arena, offsets);
return spansReader.readSpans(arena, budget, offsets);
}
catch (IOException ex) {
logger.error("Failed to read spans for docIds", ex);
return new DocumentSpans[docIds.length];
return new DocumentSpans[offsets.length];
}
}
@@ -3,10 +3,10 @@ package nu.marginalia.index.forward.construction;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexParameters;
import nu.marginalia.index.config.ForwardIndexParameters;
import nu.marginalia.index.forward.spans.IndexSpansWriter;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.searchset.DomainRankings;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.process.control.ProcessHeartbeat;
@@ -1,19 +1,18 @@
package nu.marginalia.index.forward.spans;

import nu.marginalia.index.reverse.query.IndexSearchBudget;

import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.file.Path;
import java.util.concurrent.TimeoutException;

public interface IndexSpansReader extends AutoCloseable {
DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException;
DocumentSpans[] readSpans(Arena arena, long[] encodedOffsets) throws IOException;
DocumentSpans[] readSpans(Arena arena, IndexSearchBudget budget, long[] encodedOffsets) throws TimeoutException, IOException;

static IndexSpansReader open(Path fileName) throws IOException {
int version = SpansCodec.parseSpanFilesFooter(fileName);
if (version == SpansCodec.SpansCodecVersion.COMPRESSED.ordinal()) {
return new IndexSpansReaderCompressed(fileName);
}
else if (version == SpansCodec.SpansCodecVersion.PLAIN.ordinal()) {
if (version == SpansCodec.SpansCodecVersion.PLAIN.ordinal()) {
return new IndexSpansReaderPlain(fileName);
}
else {
@@ -0,0 +1,100 @@
package nu.marginalia.index.forward.spans;

import it.unimi.dsi.fastutil.ints.IntArrayList;
import nu.marginalia.index.reverse.query.IndexSearchBudget;
import nu.marginalia.uring.UringFileReader;

import java.io.IOException;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.concurrent.TimeoutException;

public class IndexSpansReaderPlain implements IndexSpansReader {
private final UringFileReader uringReader;

public IndexSpansReaderPlain(Path spansFile) throws IOException {
if (Boolean.getBoolean("index.directModePositionsSpans")) {
if ((Files.size(spansFile) & 4095) != 0) {
throw new IllegalArgumentException("Spans file is not block aligned in size: " + Files.size(spansFile));
}

uringReader = new UringFileReader(spansFile, true);
}
else {
uringReader = new UringFileReader(spansFile, false);
uringReader.fadviseWillneed();
}

}

@Override
public DocumentSpans[] readSpans(Arena arena, IndexSearchBudget budget, long[] encodedOffsets) throws TimeoutException {

int readCnt = 0;
for (long offset : encodedOffsets) {
if (offset < 0) continue;
readCnt ++;
}

if (readCnt == 0) {
return new DocumentSpans[encodedOffsets.length];
}

long[] offsets = new long[readCnt];
int[] sizes = new int[readCnt];

for (int idx = 0, j = 0; idx < encodedOffsets.length; idx++) {
if (encodedOffsets[idx] < 0)
continue;
long offset = encodedOffsets[idx];

offsets[j] = SpansCodec.decodeStartOffset(offset);
sizes[j] = SpansCodec.decodeSize(offset);
j++;
}

List<MemorySegment> buffers = uringReader.readUnaligned(arena, budget.timeLeft(), offsets, sizes, 4096);
DocumentSpans[] ret = new DocumentSpans[encodedOffsets.length];

for (int idx = 0, j = 0; idx < encodedOffsets.length; idx++) {
if (encodedOffsets[idx] < 0)
continue;
ret[idx] = decode(buffers.get(j++));
}

return ret;
}

public DocumentSpans decode(MemorySegment ms) {
int count = ms.get(ValueLayout.JAVA_INT, 0);
int pos = 4;
DocumentSpans ret = new DocumentSpans();

// Decode each span
for (int spanIdx = 0; spanIdx < count; spanIdx++) {
byte code = ms.get(ValueLayout.JAVA_BYTE, pos);
short len = ms.get(ValueLayout.JAVA_SHORT, pos+2);

IntArrayList values = new IntArrayList(len);

pos += 4;
for (int i = 0; i < len; i++) {
values.add(ms.get(ValueLayout.JAVA_INT, pos + 4*i));
}
ret.accept(code, values);
pos += 4*len;
}

return ret;
}

@Override
public void close() throws IOException {
uringReader.close();
}

}
@@ -1,6 +1,8 @@
package nu.marginalia.index.forward.spans;

import nu.marginalia.sequence.VarintCodedSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.ByteBuffer;

@@ -12,7 +14,9 @@ import java.nio.file.StandardOpenOption;

public class IndexSpansWriter implements AutoCloseable {
private final FileChannel outputChannel;
private final ByteBuffer work = ByteBuffer.allocate(65536).order(ByteOrder.nativeOrder());
private final ByteBuffer work = ByteBuffer.allocate(4*1024*1024).order(ByteOrder.nativeOrder());

private static Logger logger = LoggerFactory.getLogger(IndexSpansWriter.class);

private long stateStartOffset = -1;
private int stateLength = -1;

@@ -38,10 +42,17 @@ public class IndexSpansWriter implements AutoCloseable {
work.put(spanCode);
work.put((byte) 0); // Ensure we're byte aligned
var sequence = new VarintCodedSequence(sequenceData);
work.putShort((short) sequence.valueCount());

int spanLength = sequence.valueCount();

if (spanLength > 8192) {
logger.warn("Excessive span length with code {}: {}", spanCode, spanLength);
spanLength = 8192;
}
work.putShort((short) spanLength);

var iter = sequence.iterator();
while (iter.hasNext()) {
for (int spanIdx = 0; iter.hasNext() && spanIdx < spanLength; spanIdx++) {
work.putInt(iter.nextInt());
}
work.flip();

@@ -55,7 +66,7 @@ public class IndexSpansWriter implements AutoCloseable {

@Override
public void close() throws IOException {
ByteBuffer footer = SpansCodec.createSpanFilesFooter(SpansCodec.SpansCodecVersion.PLAIN);
ByteBuffer footer = SpansCodec.createSpanFilesFooter(SpansCodec.SpansCodecVersion.PLAIN, (int) (4096 - (outputChannel.position() & 4095)));
outputChannel.position(outputChannel.size());
while (footer.hasRemaining()) {
outputChannel.write(footer, outputChannel.size());
@@ -10,9 +10,9 @@ public class SpansCodec {
public static int MAGIC_INT = 0xF000F000;
public static int FOOTER_SIZE = 8;

enum SpansCodecVersion {
public enum SpansCodecVersion {
@Deprecated
COMPRESSED,
DEPRECATED_1, // This must not be removed, the ordinal is used to encode the version
PLAIN
}

@@ -26,12 +26,17 @@ public class SpansCodec {
return encoded >>> 28;
}

public static long decodeSize(long encoded) {
return encoded & 0x0FFF_FFFFL;
public static int decodeSize(long encoded) {
return (int) (encoded & 0x0FFF_FFFFL);
}

public static ByteBuffer createSpanFilesFooter(SpansCodecVersion version) {
ByteBuffer footer = ByteBuffer.allocate(FOOTER_SIZE);
public static ByteBuffer createSpanFilesFooter(SpansCodecVersion version, int padSize) {
if (padSize < FOOTER_SIZE) {
padSize += 4096;
}

ByteBuffer footer = ByteBuffer.allocate(padSize);
footer.position(padSize - FOOTER_SIZE);
footer.putInt(SpansCodec.MAGIC_INT);
footer.put((byte) version.ordinal());
footer.put((byte) 0);
@@ -1,15 +1,14 @@
|
||||
package nu.marginalia.index.results.model.ids;
|
||||
package nu.marginalia.index.model;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||
import nu.marginalia.array.page.LongQueryBuffer;
|
||||
import org.roaringbitmap.longlong.Roaring64Bitmap;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.stream.LongStream;
|
||||
|
||||
/** A list of document ids, with their ranking bits still remaining.
|
||||
*
|
||||
* @see nu.marginalia.index.results.model.ids.DocIdList
|
||||
* @see DocIdList
|
||||
* @see nu.marginalia.model.id.UrlIdCodec
|
||||
* */
|
||||
public final class CombinedDocIdList {
|
||||
@@ -24,13 +23,17 @@ public final class CombinedDocIdList {
|
||||
public CombinedDocIdList(LongArrayList data) {
|
||||
this.data = data.toLongArray();
|
||||
}
|
||||
public CombinedDocIdList(Roaring64Bitmap data) {
|
||||
this.data = data.toArray();
|
||||
}
|
||||
public CombinedDocIdList() {
|
||||
this.data = new long[0];
|
||||
}
|
||||
|
||||
public static CombinedDocIdList combineLists(CombinedDocIdList one, CombinedDocIdList other) {
|
||||
long[] data = new long[one.size() + other.size()];
|
||||
System.arraycopy(one.data, 0, data, 0, one.data.length);
|
||||
System.arraycopy(other.data, 0, data, one.data.length, other.data.length);
|
||||
return new CombinedDocIdList(data);
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return data.length;
|
||||
}
|
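CombinedDocIdList above now also accepts a Roaring64Bitmap, and combineLists simply concatenates the two backing arrays, with no sorting or deduplication. A hypothetical usage sketch (the id values are made up for illustration):

    Roaring64Bitmap hits = new Roaring64Bitmap();
    hits.addLong(4_398_046_511_146L);   // example combined ids, ranking bits still attached
    hits.addLong(8_796_093_022_215L);

    CombinedDocIdList fromBitmap = new CombinedDocIdList(hits);
    CombinedDocIdList merged = CombinedDocIdList.combineLists(fromBitmap, new CombinedDocIdList());
    // merged.size() == fromBitmap.size(): concatenating with an empty list changes nothing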
@@ -1,14 +1,13 @@
|
||||
package nu.marginalia.index.results.model.ids;
|
||||
package nu.marginalia.index.model;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.LongStream;
|
||||
|
||||
/** A list of document ids, with their ranking bits removed.
|
||||
*
|
||||
* @see nu.marginalia.index.results.model.ids.CombinedDocIdList
|
||||
* @see CombinedDocIdList
|
||||
* @see nu.marginalia.model.id.UrlIdCodec
|
||||
* */
|
||||
public final class DocIdList {
|
@@ -1,10 +1,9 @@
|
||||
package nu.marginalia.index.results.model;
|
||||
package nu.marginalia.index.model;
|
||||
|
||||
import it.unimi.dsi.fastutil.ints.IntArrayList;
|
||||
import it.unimi.dsi.fastutil.ints.IntIterator;
|
||||
import it.unimi.dsi.fastutil.ints.IntList;
|
||||
import nu.marginalia.index.model.SearchTermsUtil;
|
||||
import nu.marginalia.index.results.model.ids.TermIdList;
|
||||
import nu.marginalia.language.keywords.KeywordHasher;
|
||||
import nu.marginalia.sequence.CodedSequence;
|
||||
import nu.marginalia.sequence.SequenceOperations;
|
||||
|
||||
@@ -61,7 +60,7 @@ public class PhraseConstraintGroupList {
|
||||
private final int presentCardinality;
|
||||
|
||||
public final int size;
|
||||
public PhraseConstraintGroup(List<String> terms, TermIdList termIdsAll) {
|
||||
public PhraseConstraintGroup(KeywordHasher hasher, List<String> terms, TermIdList termIdsAll) {
|
||||
offsets = new int[terms.size()];
|
||||
present = new BitSet(terms.size());
|
||||
size = terms.size();
|
||||
@@ -75,7 +74,7 @@ public class PhraseConstraintGroupList {
|
||||
}
|
||||
|
||||
present.set(i);
|
||||
long termId = SearchTermsUtil.getWordId(term);
|
||||
long termId = hasher.hashKeyword(term);
|
||||
|
||||
int idx = termIdsAll.indexOf(termId);
|
||||
if (idx < 0) {
|
@@ -1,8 +1,8 @@
|
||||
package nu.marginalia.index.model;
|
||||
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryStrategy;
|
||||
import nu.marginalia.api.searchquery.model.query.SpecificationLimit;
|
||||
import nu.marginalia.api.searchquery.model.query.SpecificationLimitType;
|
||||
import nu.marginalia.index.searchset.SearchSet;
|
||||
|
||||
import java.util.Objects;
|
||||
|
@@ -1,106 +0,0 @@
|
||||
package nu.marginalia.index.model;
|
||||
|
||||
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.index.index.CombinedIndexReader;
|
||||
|
||||
import java.util.BitSet;
|
||||
|
||||
public class ResultRankingContext {
|
||||
private final int docCount;
|
||||
public final RpcResultRankingParameters params;
|
||||
public final SearchQuery searchQuery;
|
||||
public final QueryParams queryParams;
|
||||
|
||||
public final CompiledQuery<String> compiledQuery;
|
||||
public final CompiledQueryLong compiledQueryIds;
|
||||
|
||||
public final BitSet regularMask;
|
||||
public final BitSet ngramsMask;
|
||||
|
||||
/** CqDataInt associated with frequency information of the terms in the query
|
||||
* in the full index. The dataset is indexed by the compiled query. */
|
||||
public final CqDataInt fullCounts;
|
||||
|
||||
/** CqDataInt associated with frequency information of the terms in the query
|
||||
* in the full index. The dataset is indexed by the compiled query. */
|
||||
public final CqDataInt priorityCounts;
|
||||
|
||||
public static ResultRankingContext create(CombinedIndexReader currentIndex, SearchParameters searchParameters) {
|
||||
|
||||
var compiledQueryIds = searchParameters.compiledQueryIds;
|
||||
var compiledQuery = searchParameters.compiledQuery;
|
||||
|
||||
int[] full = new int[compiledQueryIds.size()];
|
||||
int[] prio = new int[compiledQueryIds.size()];
|
||||
|
||||
BitSet ngramsMask = new BitSet(compiledQuery.size());
|
||||
BitSet regularMask = new BitSet(compiledQuery.size());
|
||||
|
||||
for (int idx = 0; idx < compiledQueryIds.size(); idx++) {
|
||||
long id = compiledQueryIds.at(idx);
|
||||
full[idx] = currentIndex.numHits(id);
|
||||
prio[idx] = currentIndex.numHitsPrio(id);
|
||||
|
||||
if (compiledQuery.at(idx).contains("_")) {
|
||||
ngramsMask.set(idx);
|
||||
}
|
||||
else {
|
||||
regularMask.set(idx);
|
||||
}
|
||||
}
|
||||
|
||||
return new ResultRankingContext(currentIndex.totalDocCount(),
|
||||
searchParameters,
|
||||
compiledQuery,
|
||||
compiledQueryIds,
|
||||
ngramsMask,
|
||||
regularMask,
|
||||
new CqDataInt(full),
|
||||
new CqDataInt(prio));
|
||||
}
|
||||
|
||||
public ResultRankingContext(int docCount,
|
||||
SearchParameters searchParameters,
|
||||
CompiledQuery<String> compiledQuery,
|
||||
CompiledQueryLong compiledQueryIds,
|
||||
BitSet ngramsMask,
|
||||
BitSet regularMask,
|
||||
CqDataInt fullCounts,
|
||||
CqDataInt prioCounts)
|
||||
{
|
||||
this.docCount = docCount;
|
||||
|
||||
this.searchQuery = searchParameters.query;
|
||||
this.params = searchParameters.rankingParams;
|
||||
this.queryParams = searchParameters.queryParams;
|
||||
|
||||
this.compiledQuery = compiledQuery;
|
||||
this.compiledQueryIds = compiledQueryIds;
|
||||
|
||||
this.ngramsMask = ngramsMask;
|
||||
this.regularMask = regularMask;
|
||||
|
||||
this.fullCounts = fullCounts;
|
||||
this.priorityCounts = prioCounts;
|
||||
}
|
||||
|
||||
public int termFreqDocCount() {
|
||||
return docCount;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "ResultRankingContext{" +
|
||||
"docCount=" + docCount +
|
||||
", params=" + params +
|
||||
", regularMask=" + regularMask +
|
||||
", ngramsMask=" + ngramsMask +
|
||||
", fullCounts=" + fullCounts +
|
||||
", priorityCounts=" + priorityCounts +
|
||||
'}';
|
||||
}
|
||||
}
|
243
code/index/java/nu/marginalia/index/model/SearchContext.java
Normal file
@@ -0,0 +1,243 @@
|
||||
package nu.marginalia.index.model;
|
||||
|
||||
import gnu.trove.map.hash.TObjectLongHashMap;
|
||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||
import it.unimi.dsi.fastutil.longs.LongComparator;
|
||||
import it.unimi.dsi.fastutil.longs.LongList;
|
||||
import nu.marginalia.api.searchquery.IndexProtobufCodec;
|
||||
import nu.marginalia.api.searchquery.RpcIndexQuery;
|
||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryParser;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryStrategy;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
|
||||
import nu.marginalia.index.CombinedIndexReader;
|
||||
import nu.marginalia.index.reverse.IndexLanguageContext;
|
||||
import nu.marginalia.index.reverse.query.IndexSearchBudget;
|
||||
import nu.marginalia.index.searchset.SearchSet;
|
||||
import nu.marginalia.language.keywords.KeywordHasher;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.BitSet;
|
||||
import java.util.List;
|
||||
|
||||
import static nu.marginalia.api.searchquery.IndexProtobufCodec.convertSpecLimit;
|
||||
|
||||
public class SearchContext {
|
||||
private static final Logger logger = LoggerFactory.getLogger(SearchContext.class);
|
||||
|
||||
public final IndexSearchBudget budget;
|
||||
|
||||
public final int fetchSize;
|
||||
public final int limitByDomain;
|
||||
public final int limitTotal;
|
||||
|
||||
private final int docCount;
|
||||
|
||||
public final RpcResultRankingParameters params;
|
||||
public final SearchQuery searchQuery;
|
||||
public final QueryParams queryParams;
|
||||
|
||||
public final CompiledQuery<String> compiledQuery;
|
||||
public final CompiledQueryLong compiledQueryIds;
|
||||
|
||||
/** Bitmask whose bits correspond to the positions in the compiled query data
|
||||
* which are regular words.
|
||||
*/
|
||||
public final BitSet regularMask;
|
||||
|
||||
/** Bitmask whose bits correspond to the positions in the compiled query data
|
||||
* which are ngrams.
|
||||
*/
|
||||
public final BitSet ngramsMask;
|
||||
|
||||
/** CqDataInt associated with frequency information of the terms in the query
|
||||
* in the full index. The dataset is indexed by the compiled query. */
|
||||
public final CqDataInt fullCounts;
|
||||
|
||||
/** CqDataInt associated with frequency information of the terms in the query
|
||||
* in the priority index. The dataset is indexed by the compiled query. */
|
||||
public final CqDataInt priorityCounts;
|
||||
|
||||
public final TermIdList termIdsAll;
|
||||
public final PhraseConstraintGroupList phraseConstraints;
|
||||
|
||||
public final LongList termIdsAdvice;
|
||||
public final LongList termIdsExcludes;
|
||||
public final LongList termIdsPriority;
|
||||
|
||||
public final IndexLanguageContext languageContext;
|
||||
|
||||
public static SearchContext create(CombinedIndexReader currentIndex,
|
||||
KeywordHasher keywordHasher,
|
||||
SearchSpecification specsSet,
|
||||
SearchSet searchSet) {
|
||||
|
||||
var queryParams = new QueryParams(specsSet.quality, specsSet.year, specsSet.size, specsSet.rank, searchSet, specsSet.queryStrategy);
|
||||
var rankingParams = specsSet.rankingParams;
|
||||
var limits = specsSet.queryLimits;
|
||||
|
||||
return new SearchContext(
|
||||
keywordHasher,
|
||||
"en", // FIXME: This path currently only supports english
|
||||
currentIndex,
|
||||
specsSet.query.compiledQuery,
|
||||
queryParams,
|
||||
specsSet.query,
|
||||
rankingParams,
|
||||
limits);
|
||||
}
|
||||
|
||||
public static SearchContext create(CombinedIndexReader currentIndex,
|
||||
KeywordHasher keywordHasher,
|
||||
RpcIndexQuery request, SearchSet searchSet) {
|
||||
var limits = request.getQueryLimits();
|
||||
var query = IndexProtobufCodec.convertRpcQuery(request.getQuery());
|
||||
|
||||
var queryParams = new QueryParams(
|
||||
convertSpecLimit(request.getQuality()),
|
||||
convertSpecLimit(request.getYear()),
|
||||
convertSpecLimit(request.getSize()),
|
||||
convertSpecLimit(request.getRank()),
|
||||
searchSet,
|
||||
QueryStrategy.valueOf(request.getQueryStrategy()));
|
||||
|
||||
var rankingParams = request.hasParameters() ? request.getParameters() : PrototypeRankingParameters.sensibleDefaults();
|
||||
|
||||
return new SearchContext(
|
||||
keywordHasher,
|
||||
request.getLangIsoCode(),
|
||||
currentIndex,
|
||||
query.compiledQuery,
|
||||
queryParams,
|
||||
query,
|
||||
rankingParams,
|
||||
limits);
|
||||
}
|
||||
|
||||
public SearchContext(
|
||||
KeywordHasher keywordHasher,
|
||||
String langIsoCode,
|
||||
CombinedIndexReader currentIndex,
|
||||
String queryExpression,
|
||||
QueryParams queryParams,
|
||||
SearchQuery query,
|
||||
RpcResultRankingParameters rankingParams,
|
||||
RpcQueryLimits limits)
|
||||
{
|
||||
this.docCount = currentIndex.totalDocCount();
|
||||
this.languageContext = currentIndex.createLanguageContext(langIsoCode);
|
||||
|
||||
this.budget = new IndexSearchBudget(Math.max(limits.getTimeoutMs()/2, limits.getTimeoutMs()-50));
|
||||
this.searchQuery = query;
|
||||
this.params = rankingParams;
|
||||
this.queryParams = queryParams;
|
||||
|
||||
this.fetchSize = limits.getFetchSize();
|
||||
this.limitByDomain = limits.getResultsByDomain();
|
||||
this.limitTotal = limits.getResultsTotal();
|
||||
|
||||
|
||||
this.compiledQuery = CompiledQueryParser.parse(queryExpression);
|
||||
this.compiledQueryIds = compiledQuery.mapToLong(keywordHasher::hashKeyword);
|
||||
int[] full = new int[compiledQueryIds.size()];
|
||||
int[] prio = new int[compiledQueryIds.size()];
|
||||
|
||||
this.ngramsMask = new BitSet(compiledQuery.size());
|
||||
this.regularMask = new BitSet(compiledQuery.size());
|
||||
|
||||
for (int idx = 0; idx < compiledQueryIds.size(); idx++) {
|
||||
long id = compiledQueryIds.at(idx);
|
||||
full[idx] = currentIndex.numHits(this.languageContext, id);
|
||||
prio[idx] = currentIndex.numHitsPrio(this.languageContext, id);
|
||||
|
||||
if (compiledQuery.at(idx).contains("_")) {
|
||||
ngramsMask.set(idx);
|
||||
}
|
||||
else {
|
||||
regularMask.set(idx);
|
||||
}
|
||||
}
|
||||
|
||||
this.fullCounts = new CqDataInt(full);
|
||||
this.priorityCounts = new CqDataInt(prio);
|
||||
|
||||
this.termIdsExcludes = new LongArrayList();
|
||||
this.termIdsPriority = new LongArrayList();
|
||||
this.termIdsAdvice = new LongArrayList();
|
||||
|
||||
for (var word : searchQuery.searchTermsAdvice) {
|
||||
termIdsAdvice.add(keywordHasher.hashKeyword(word));
|
||||
}
|
||||
|
||||
for (var word : searchQuery.searchTermsExclude) {
|
||||
termIdsExcludes.add(keywordHasher.hashKeyword(word));
|
||||
}
|
||||
|
||||
for (var word : searchQuery.searchTermsPriority) {
|
||||
termIdsPriority.add(keywordHasher.hashKeyword(word));
|
||||
}
|
||||
|
||||
LongArrayList termIdsList = new LongArrayList();
|
||||
TObjectLongHashMap<Object> termToId = new TObjectLongHashMap<>();
|
||||
|
||||
for (String word : compiledQuery) {
|
||||
long id = keywordHasher.hashKeyword(word);
|
||||
termIdsList.add(id);
|
||||
termToId.put(word, id);
|
||||
}
|
||||
|
||||
for (var term : searchQuery.searchTermsPriority) {
|
||||
if (termToId.containsKey(term)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
long id = keywordHasher.hashKeyword(term);
|
||||
termIdsList.add(id);
|
||||
termToId.put(term, id);
|
||||
}
|
||||
|
||||
termIdsAll = new TermIdList(termIdsList);
|
||||
|
||||
var constraintsMandatory = new ArrayList<PhraseConstraintGroupList.PhraseConstraintGroup>();
|
||||
var constraintsFull = new ArrayList<PhraseConstraintGroupList.PhraseConstraintGroup>();
|
||||
var constraintsOptional = new ArrayList<PhraseConstraintGroupList.PhraseConstraintGroup>();
|
||||
|
||||
for (var constraint : searchQuery.phraseConstraints) {
|
||||
switch (constraint) {
|
||||
case SearchPhraseConstraint.Mandatory(List<String> terms) ->
|
||||
constraintsMandatory.add(new PhraseConstraintGroupList.PhraseConstraintGroup(keywordHasher, terms, termIdsAll));
|
||||
case SearchPhraseConstraint.Optional(List<String> terms) ->
|
||||
constraintsOptional.add(new PhraseConstraintGroupList.PhraseConstraintGroup(keywordHasher, terms, termIdsAll));
|
||||
case SearchPhraseConstraint.Full(List<String> terms) ->
|
||||
constraintsFull.add(new PhraseConstraintGroupList.PhraseConstraintGroup(keywordHasher, terms, termIdsAll));
|
||||
}
|
||||
}
|
||||
|
||||
if (constraintsFull.isEmpty()) {
|
||||
logger.warn("No full constraints in query, adding empty group");
|
||||
constraintsFull.add(new PhraseConstraintGroupList.PhraseConstraintGroup(keywordHasher, List.of(), termIdsAll));
|
||||
}
|
||||
|
||||
this.phraseConstraints = new PhraseConstraintGroupList(constraintsFull.getFirst(), constraintsMandatory, constraintsOptional);
|
||||
}
|
||||
|
||||
public int termFreqDocCount() {
|
||||
return docCount;
|
||||
}
|
||||
|
||||
public long[] sortedDistinctIncludes(LongComparator comparator) {
|
||||
LongList list = new LongArrayList(compiledQueryIds.copyData());
|
||||
list.sort(comparator);
|
||||
return list.toLongArray();
|
||||
}
|
||||
|
||||
}
|
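One detail of the constructor above worth spelling out: the search budget is derived from the request timeout as max(timeout/2, timeout - 50), so short timeouts are halved while long ones only give up a fixed 50 ms margin for wrapping up and returning results. A couple of worked values, assuming getTimeoutMs() is in milliseconds as the name suggests:

    long timeoutMs = limits.getTimeoutMs();
    IndexSearchBudget budget = new IndexSearchBudget(Math.max(timeoutMs / 2, timeoutMs - 50));
    // timeoutMs =   80  ->  max(40, 30)   =  40 ms budget
    // timeoutMs =  200  ->  max(100, 150) = 150 ms budget
    // timeoutMs = 1000  ->  max(500, 950) = 950 ms budget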
@@ -1,97 +0,0 @@
|
||||
package nu.marginalia.index.model;
|
||||
|
||||
import nu.marginalia.api.searchquery.IndexProtobufCodec;
|
||||
import nu.marginalia.api.searchquery.RpcIndexQuery;
|
||||
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryParser;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
|
||||
import nu.marginalia.index.query.IndexSearchBudget;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.searchset.SearchSet;
|
||||
|
||||
import static nu.marginalia.api.searchquery.IndexProtobufCodec.convertSpecLimit;
|
||||
|
||||
public class SearchParameters {
|
||||
/**
|
||||
* This is how many results matching the keywords we'll try to get
|
||||
* before evaluating them for the best result.
|
||||
*/
|
||||
public final int fetchSize;
|
||||
public final IndexSearchBudget budget;
|
||||
public final SearchQuery query;
|
||||
public final QueryParams queryParams;
|
||||
public final RpcResultRankingParameters rankingParams;
|
||||
|
||||
public final int limitByDomain;
|
||||
public final int limitTotal;
|
||||
|
||||
public final CompiledQuery<String> compiledQuery;
|
||||
public final CompiledQueryLong compiledQueryIds;
|
||||
|
||||
// mutable:
|
||||
|
||||
/**
|
||||
* An estimate of how much data has been read
|
||||
*/
|
||||
public long dataCost = 0;
|
||||
|
||||
public SearchParameters(SearchSpecification specsSet, SearchSet searchSet) {
|
||||
var limits = specsSet.queryLimits;
|
||||
|
||||
this.fetchSize = limits.getFetchSize();
|
||||
this.budget = new IndexSearchBudget(limits.getTimeoutMs());
|
||||
this.query = specsSet.query;
|
||||
this.limitByDomain = limits.getResultsByDomain();
|
||||
this.limitTotal = limits.getResultsTotal();
|
||||
|
||||
queryParams = new QueryParams(
|
||||
specsSet.quality,
|
||||
specsSet.year,
|
||||
specsSet.size,
|
||||
specsSet.rank,
|
||||
searchSet,
|
||||
specsSet.queryStrategy);
|
||||
|
||||
compiledQuery = CompiledQueryParser.parse(this.query.compiledQuery);
|
||||
compiledQueryIds = compiledQuery.mapToLong(SearchTermsUtil::getWordId);
|
||||
|
||||
rankingParams = specsSet.rankingParams;
|
||||
}
|
||||
|
||||
public SearchParameters(RpcIndexQuery request, SearchSet searchSet) {
|
||||
var limits = request.getQueryLimits();
|
||||
|
||||
this.fetchSize = limits.getFetchSize();
|
||||
|
||||
// The time budget is halved because this is the point when we start to
|
||||
// wrap up the search and return the results.
|
||||
this.budget = new IndexSearchBudget(limits.getTimeoutMs() / 2);
|
||||
this.query = IndexProtobufCodec.convertRpcQuery(request.getQuery());
|
||||
|
||||
this.limitByDomain = limits.getResultsByDomain();
|
||||
this.limitTotal = limits.getResultsTotal();
|
||||
|
||||
queryParams = new QueryParams(
|
||||
convertSpecLimit(request.getQuality()),
|
||||
convertSpecLimit(request.getYear()),
|
||||
convertSpecLimit(request.getSize()),
|
||||
convertSpecLimit(request.getRank()),
|
||||
searchSet,
|
||||
QueryStrategy.valueOf(request.getQueryStrategy()));
|
||||
|
||||
compiledQuery = CompiledQueryParser.parse(this.query.compiledQuery);
|
||||
compiledQueryIds = compiledQuery.mapToLong(SearchTermsUtil::getWordId);
|
||||
|
||||
rankingParams = request.hasParameters() ? request.getParameters() : PrototypeRankingParameters.sensibleDefaults();
|
||||
}
|
||||
|
||||
|
||||
public long getDataCost() {
|
||||
return dataCost;
|
||||
}
|
||||
|
||||
}
|
@@ -1,72 +0,0 @@
|
||||
package nu.marginalia.index.model;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||
import it.unimi.dsi.fastutil.longs.LongArraySet;
|
||||
import it.unimi.dsi.fastutil.longs.LongComparator;
|
||||
import it.unimi.dsi.fastutil.longs.LongList;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
|
||||
import static nu.marginalia.index.model.SearchTermsUtil.getWordId;
|
||||
|
||||
public final class SearchTerms {
|
||||
private final LongList advice;
|
||||
private final LongList excludes;
|
||||
private final LongList priority;
|
||||
|
||||
public static final LongArraySet stopWords = new LongArraySet(
|
||||
new long[] {
|
||||
getWordId("a"),
|
||||
getWordId("an"),
|
||||
getWordId("the"),
|
||||
}
|
||||
);
|
||||
|
||||
private final CompiledQueryLong compiledQueryIds;
|
||||
|
||||
public SearchTerms(SearchQuery query,
|
||||
CompiledQueryLong compiledQueryIds)
|
||||
{
|
||||
this.excludes = new LongArrayList();
|
||||
this.priority = new LongArrayList();
|
||||
|
||||
this.advice = new LongArrayList();
|
||||
this.compiledQueryIds = compiledQueryIds;
|
||||
|
||||
for (var word : query.searchTermsAdvice) {
|
||||
advice.add(getWordId(word));
|
||||
}
|
||||
|
||||
for (var word : query.searchTermsExclude) {
|
||||
excludes.add(getWordId(word));
|
||||
}
|
||||
|
||||
for (var word : query.searchTermsPriority) {
|
||||
priority.add(getWordId(word));
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return compiledQueryIds.isEmpty();
|
||||
}
|
||||
|
||||
public long[] sortedDistinctIncludes(LongComparator comparator) {
|
||||
LongList list = new LongArrayList(compiledQueryIds.copyData());
|
||||
list.sort(comparator);
|
||||
return list.toLongArray();
|
||||
}
|
||||
|
||||
|
||||
public LongList excludes() {
|
||||
return excludes;
|
||||
}
|
||||
public LongList advice() {
|
||||
return advice;
|
||||
}
|
||||
public LongList priority() {
|
||||
return priority;
|
||||
}
|
||||
|
||||
public CompiledQueryLong compiledQuery() { return compiledQueryIds; }
|
||||
|
||||
}
|
@@ -1,13 +0,0 @@
|
||||
package nu.marginalia.index.model;
|
||||
|
||||
import nu.marginalia.hash.MurmurHash3_128;
|
||||
|
||||
public class SearchTermsUtil {
|
||||
|
||||
private static final MurmurHash3_128 hasher = new MurmurHash3_128();
|
||||
|
||||
/** Translate the word to a unique id. */
|
||||
public static long getWordId(String s) {
|
||||
return hasher.hashKeyword(s);
|
||||
}
|
||||
}
|
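With SearchTermsUtil gone, term ids are no longer produced by a static MurmurHash3_128 helper but by whichever KeywordHasher is injected, which presumably has to be the same hasher that was used when the index was built or lookups will miss. A minimal sketch of the swap:

    // keywordHasher is the injected nu.marginalia.language.keywords.KeywordHasher
    long termId = keywordHasher.hashKeyword("example");   // replaces the old static SearchTermsUtil.getWordId("example")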
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia.index.results.model.ids;
|
||||
package nu.marginalia.index.model;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||
|
||||
@@ -6,7 +6,7 @@ import java.util.Arrays;
|
||||
import java.util.stream.LongStream;
|
||||
|
||||
public final class TermIdList {
|
||||
private final long[] array;
|
||||
public final long[] array;
|
||||
|
||||
public TermIdList(long[] array) {
|
||||
this.array = array;
|
@@ -1,6 +1,6 @@
|
||||
package nu.marginalia.index.results.model.ids;
|
||||
package nu.marginalia.index.model;
|
||||
|
||||
import nu.marginalia.index.positions.TermData;
|
||||
import nu.marginalia.index.reverse.positions.TermData;
|
||||
import nu.marginalia.sequence.CodedSequence;
|
||||
|
||||
import javax.annotation.Nullable;
|
@@ -2,7 +2,7 @@ package nu.marginalia.index.results;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
|
||||
import nu.marginalia.index.model.ResultRankingContext;
|
||||
import nu.marginalia.index.model.SearchContext;
|
||||
|
||||
import java.util.BitSet;
|
||||
import java.util.List;
|
||||
@@ -26,7 +26,7 @@ public class Bm25GraphVisitor implements CqExpression.DoubleVisitor {
|
||||
public Bm25GraphVisitor(double k1, double b,
|
||||
float[] counts,
|
||||
int length,
|
||||
ResultRankingContext ctx) {
|
||||
SearchContext ctx) {
|
||||
this.length = length;
|
||||
|
||||
this.k1 = k1;
|
||||
|
@@ -4,27 +4,18 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import gnu.trove.list.TLongList;
|
||||
import gnu.trove.list.array.TLongArrayList;
|
||||
import gnu.trove.map.hash.TObjectLongHashMap;
|
||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
|
||||
import nu.marginalia.api.searchquery.*;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
|
||||
import nu.marginalia.index.ResultPriorityQueue;
|
||||
import nu.marginalia.index.CombinedIndexReader;
|
||||
import nu.marginalia.index.StatefulIndex;
|
||||
import nu.marginalia.index.forward.spans.DocumentSpans;
|
||||
import nu.marginalia.index.index.CombinedIndexReader;
|
||||
import nu.marginalia.index.index.StatefulIndex;
|
||||
import nu.marginalia.index.model.ResultRankingContext;
|
||||
import nu.marginalia.index.model.SearchTermsUtil;
|
||||
import nu.marginalia.index.results.model.PhraseConstraintGroupList;
|
||||
import nu.marginalia.index.results.model.QuerySearchTerms;
|
||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||
import nu.marginalia.index.results.model.ids.TermIdList;
|
||||
import nu.marginalia.index.results.model.ids.TermMetadataList;
|
||||
import nu.marginalia.index.model.CombinedDocIdList;
|
||||
import nu.marginalia.index.model.SearchContext;
|
||||
import nu.marginalia.index.model.TermMetadataList;
|
||||
import nu.marginalia.linkdb.docs.DocumentDbReader;
|
||||
import nu.marginalia.linkdb.model.DocdbUrlDetail;
|
||||
import nu.marginalia.sequence.CodedSequence;
|
||||
@@ -37,6 +28,8 @@ import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
@Singleton
|
||||
public class IndexResultRankingService {
|
||||
@@ -56,87 +49,131 @@ public class IndexResultRankingService {
|
||||
this.domainRankingOverrides = domainRankingOverrides;
|
||||
}
|
||||
|
||||
public List<SearchResultItem> rankResults(
|
||||
ResultRankingContext rankingContext,
|
||||
CombinedDocIdList resultIds,
|
||||
boolean exportDebugData)
|
||||
{
|
||||
if (resultIds.isEmpty())
|
||||
return List.of();
|
||||
public RankingData prepareRankingData(SearchContext rankingContext, CombinedDocIdList resultIds) throws TimeoutException {
|
||||
return new RankingData(rankingContext, resultIds);
|
||||
}
|
||||
|
||||
IndexResultScoreCalculator resultRanker = new IndexResultScoreCalculator(statefulIndex, domainRankingOverrides, rankingContext);
|
||||
public final class RankingData implements AutoCloseable {
|
||||
final Arena arena;
|
||||
|
||||
List<SearchResultItem> results = new ArrayList<>(resultIds.size());
|
||||
private final TermMetadataList[] termsForDocs;
|
||||
private final DocumentSpans[] documentSpans;
|
||||
private final long[] flags;
|
||||
private final CodedSequence[] positions;
|
||||
private final CombinedDocIdList resultIds;
|
||||
private AtomicBoolean closed = new AtomicBoolean(false);
|
||||
int pos = -1;
|
||||
|
||||
// Get the current index reader, which is the one we'll use for this calculation,
|
||||
// this may change during the calculation, but we don't want to switch over mid-calculation
|
||||
final CombinedIndexReader currentIndex = statefulIndex.get();
|
||||
public RankingData(SearchContext rankingContext, CombinedDocIdList resultIds) throws TimeoutException {
|
||||
this.resultIds = resultIds;
|
||||
this.arena = Arena.ofShared();
|
||||
|
||||
final QuerySearchTerms searchTerms = getSearchTerms(rankingContext.compiledQuery, rankingContext.searchQuery);
|
||||
final int termCount = searchTerms.termIdsAll.size();
|
||||
final int termCount = rankingContext.termIdsAll.size();
|
||||
|
||||
// We use an arena for the position and spans data to limit gc pressure
|
||||
try (var arena = Arena.ofShared()) {
|
||||
this.flags = new long[termCount];
|
||||
this.positions = new CodedSequence[termCount];
|
||||
|
||||
TermMetadataList[] termsForDocs = new TermMetadataList[termCount];
|
||||
for (int ti = 0; ti < termCount; ti++) {
|
||||
termsForDocs[ti] = currentIndex.getTermMetadata(arena, searchTerms.termIdsAll.at(ti), resultIds);
|
||||
// Get the current index reader, which is the one we'll use for this calculation,
|
||||
// this may change during the calculation, but we don't want to switch over mid-calculation
|
||||
|
||||
final CombinedIndexReader currentIndex = statefulIndex.get();
|
||||
|
||||
// Perform expensive I/O operations
|
||||
|
||||
try {
|
||||
this.termsForDocs = currentIndex.getTermMetadata(arena, rankingContext.languageContext, rankingContext.budget, rankingContext.termIdsAll.array, resultIds);
|
||||
this.documentSpans = currentIndex.getDocumentSpans(arena, rankingContext.budget, resultIds);
|
||||
}
|
||||
catch (TimeoutException|RuntimeException ex) {
|
||||
arena.close();
|
||||
throw ex;
|
||||
}
|
||||
}
|
||||
|
||||
// Data for the document. We arrange this in arrays outside the calculation function to avoid
|
||||
// hash lookups in the inner loop, as it's hot code, and we don't want unnecessary cpu cache
|
||||
// thrashing in there; out here we can rely on implicit array ordering to match up the data.
|
||||
public CodedSequence[] positions() {
|
||||
return positions;
|
||||
}
|
||||
public long[] flags() {
|
||||
return flags;
|
||||
}
|
||||
public long resultId() {
|
||||
return resultIds.at(pos);
|
||||
}
|
||||
public DocumentSpans documentSpans() {
|
||||
return documentSpans[pos];
|
||||
}
|
||||
|
||||
long[] flags = new long[termCount];
|
||||
CodedSequence[] positions = new CodedSequence[termCount];
|
||||
DocumentSpans[] documentSpans = currentIndex.getDocumentSpans(arena, resultIds);
|
||||
|
||||
// Iterate over documents by their index in the combinedDocIds, as we need the index for the
|
||||
// term data arrays as well
|
||||
|
||||
for (int i = 0; i < resultIds.size(); i++) {
|
||||
|
||||
// Prepare term-level data for the document
|
||||
public boolean next() {
|
||||
if (++pos < resultIds.size()) {
|
||||
for (int ti = 0; ti < flags.length; ti++) {
|
||||
var tfd = termsForDocs[ti];
|
||||
|
||||
assert tfd != null : "No term data for term " + ti;
|
||||
|
||||
flags[ti] = tfd.flag(i);
|
||||
positions[ti] = tfd.position(i);
|
||||
flags[ti] = tfd.flag(pos);
|
||||
positions[ti] = tfd.position(pos);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Ignore documents that don't match the mandatory constraints
|
||||
if (!searchTerms.phraseConstraints.testMandatory(positions)) {
|
||||
continue;
|
||||
}
|
||||
public int size() {
|
||||
return resultIds.size();
|
||||
}
|
||||
|
||||
if (!exportDebugData) {
|
||||
var score = resultRanker.calculateScore(null, resultIds.at(i), searchTerms, flags, positions, documentSpans[i]);
|
||||
if (score != null) {
|
||||
results.add(score);
|
||||
}
|
||||
}
|
||||
else {
|
||||
var rankingFactors = new DebugRankingFactors();
|
||||
var score = resultRanker.calculateScore( rankingFactors, resultIds.at(i), searchTerms, flags, positions, documentSpans[i]);
|
||||
public void close() {
|
||||
if (closed.compareAndSet(false, true)) {
|
||||
arena.close();
|
||||
}
|
||||
}
|
||||
|
||||
if (score != null) {
|
||||
score.debugRankingFactors = rankingFactors;
|
||||
results.add(score);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public List<SearchResultItem> rankResults(
|
||||
SearchContext rankingContext,
|
||||
RankingData rankingData,
|
||||
boolean exportDebugData)
|
||||
{
|
||||
IndexResultScoreCalculator resultRanker = new IndexResultScoreCalculator(statefulIndex, domainRankingOverrides, rankingContext);
|
||||
|
||||
List<SearchResultItem> results = new ArrayList<>(rankingData.size());
|
||||
|
||||
// Iterate over documents by their index in the combinedDocIds, as we need the index for the
|
||||
// term data arrays as well
|
||||
|
||||
while (rankingData.next() && rankingContext.budget.hasTimeLeft()) {
|
||||
|
||||
// Ignore documents that don't match the mandatory constraints
|
||||
if (!rankingContext.phraseConstraints.testMandatory(rankingData.positions())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
return results;
|
||||
if (!exportDebugData) {
|
||||
var score = resultRanker.calculateScore(null, rankingData.resultId(), rankingContext, rankingData.flags(), rankingData.positions(), rankingData.documentSpans());
|
||||
if (score != null) {
|
||||
results.add(score);
|
||||
}
|
||||
}
|
||||
else {
|
||||
var rankingFactors = new DebugRankingFactors();
|
||||
var score = resultRanker.calculateScore( rankingFactors, rankingData.resultId(), rankingContext, rankingData.flags(), rankingData.positions(), rankingData.documentSpans());
|
||||
|
||||
if (score != null) {
|
||||
score.debugRankingFactors = rankingFactors;
|
||||
results.add(score);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
|
||||
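The selectBestResults path below is the intended consumer of the refactor above: ranking data is prepared once (the expensive term-metadata and document-span reads happen under a shared arena), iterated with next(), and released by close(). A condensed sketch of that call pattern, assuming a service instance named rankingService, an existing SearchContext ctx, and a LongArrayList of candidate combined ids gathered earlier:

    CombinedDocIdList ids = new CombinedDocIdList(candidateIds);
    try (var data = rankingService.prepareRankingData(ctx, ids)) {
        List<SearchResultItem> ranked = rankingService.rankResults(ctx, data, false);
        // hand 'ranked' on to selectBestResults / the result priority queue
    }
    catch (TimeoutException e) {
        // the budget ran out before the data could be read; return early with nothing from this batch
    }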
public List<RpcDecoratedResultItem> selectBestResults(int limitByDomain,
|
||||
int limitTotal,
|
||||
ResultRankingContext resultRankingContext,
|
||||
ResultPriorityQueue results) throws SQLException {
|
||||
SearchContext searchContext,
|
||||
List<SearchResultItem> results) throws SQLException {
|
||||
|
||||
var domainCountFilter = new IndexResultDomainDeduplicator(limitByDomain);
|
||||
|
||||
@@ -164,18 +201,25 @@ public class IndexResultRankingService {
|
||||
// for the selected results, as this would be comically expensive to do for all the results we
|
||||
// discard along the way
|
||||
|
||||
if (resultRankingContext.params.getExportDebugData()) {
|
||||
if (searchContext.params.getExportDebugData()) {
|
||||
var combinedIdsList = new LongArrayList(resultsList.size());
|
||||
for (var item : resultsList) {
|
||||
combinedIdsList.add(item.combinedId);
|
||||
}
|
||||
|
||||
resultsList.clear();
|
||||
resultsList.addAll(this.rankResults(
|
||||
resultRankingContext,
|
||||
new CombinedDocIdList(combinedIdsList),
|
||||
true)
|
||||
);
|
||||
|
||||
try (var data = prepareRankingData(searchContext, new CombinedDocIdList(combinedIdsList))) {
|
||||
resultsList.addAll(this.rankResults(
|
||||
searchContext,
|
||||
data,
|
||||
true)
|
||||
);
|
||||
}
|
||||
catch (TimeoutException ex) {
|
||||
// this won't happen since we passed null for budget
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Fetch the document details for the selected results in one go, from the local document database
|
||||
@@ -251,7 +295,7 @@ public class IndexResultRankingService {
|
||||
|
||||
var termOutputs = RpcResultTermRankingOutputs.newBuilder();
|
||||
|
||||
CqDataLong termIds = resultRankingContext.compiledQueryIds.data;
|
||||
CqDataLong termIds = searchContext.compiledQueryIds.data;
|
||||
|
||||
for (var entry : debugFactors.getTermFactors()) {
|
||||
String term = "[ERROR IN LOOKUP]";
|
||||
@@ -259,7 +303,7 @@ public class IndexResultRankingService {
|
||||
// CURSED: This is a linear search, but the number of terms is small, and it's in a debug path
|
||||
for (int i = 0; i < termIds.size(); i++) {
|
||||
if (termIds.get(i) == entry.termId()) {
|
||||
term = resultRankingContext.compiledQuery.at(i);
|
||||
term = searchContext.compiledQuery.at(i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -282,54 +326,5 @@ public class IndexResultRankingService {
|
||||
}
|
||||
|
||||
|
||||
public QuerySearchTerms getSearchTerms(CompiledQuery<String> compiledQuery, SearchQuery searchQuery) {
|
||||
|
||||
LongArrayList termIdsList = new LongArrayList();
|
||||
|
||||
TObjectLongHashMap<String> termToId = new TObjectLongHashMap<>(10, 0.75f, -1);
|
||||
|
||||
for (String word : compiledQuery) {
|
||||
long id = SearchTermsUtil.getWordId(word);
|
||||
termIdsList.add(id);
|
||||
termToId.put(word, id);
|
||||
}
|
||||
|
||||
for (var term : searchQuery.searchTermsPriority) {
|
||||
if (termToId.containsKey(term)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
long id = SearchTermsUtil.getWordId(term);
|
||||
termIdsList.add(id);
|
||||
termToId.put(term, id);
|
||||
}
|
||||
|
||||
var idsAll = new TermIdList(termIdsList);
|
||||
|
||||
var constraintsMandatory = new ArrayList<PhraseConstraintGroupList.PhraseConstraintGroup>();
|
||||
var constraintsFull = new ArrayList<PhraseConstraintGroupList.PhraseConstraintGroup>();
|
||||
var constraintsOptional = new ArrayList<PhraseConstraintGroupList.PhraseConstraintGroup>();
|
||||
|
||||
for (var constraint : searchQuery.phraseConstraints) {
|
||||
switch (constraint) {
|
||||
case SearchPhraseConstraint.Mandatory(List<String> terms) ->
|
||||
constraintsMandatory.add(new PhraseConstraintGroupList.PhraseConstraintGroup(terms, idsAll));
|
||||
case SearchPhraseConstraint.Optional(List<String> terms) ->
|
||||
constraintsOptional.add(new PhraseConstraintGroupList.PhraseConstraintGroup(terms, idsAll));
|
||||
case SearchPhraseConstraint.Full(List<String> terms) ->
|
||||
constraintsFull.add(new PhraseConstraintGroupList.PhraseConstraintGroup(terms, idsAll));
|
||||
}
|
||||
}
|
||||
|
||||
if (constraintsFull.isEmpty()) {
|
||||
logger.warn("No full constraints in query, adding empty group");
|
||||
constraintsFull.add(new PhraseConstraintGroupList.PhraseConstraintGroup(List.of(), idsAll));
|
||||
}
|
||||
|
||||
|
||||
return new QuerySearchTerms(termToId,
|
||||
idsAll,
|
||||
new PhraseConstraintGroupList(constraintsFull.getFirst(), constraintsMandatory, constraintsOptional)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@@ -6,16 +6,15 @@ import nu.marginalia.api.searchquery.RpcResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.RpcTemporalBias;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryStrategy;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
|
||||
import nu.marginalia.index.CombinedIndexReader;
|
||||
import nu.marginalia.index.StatefulIndex;
|
||||
import nu.marginalia.index.forward.spans.DocumentSpans;
|
||||
import nu.marginalia.index.index.CombinedIndexReader;
|
||||
import nu.marginalia.index.index.StatefulIndex;
|
||||
import nu.marginalia.index.model.PhraseConstraintGroupList;
|
||||
import nu.marginalia.index.model.QueryParams;
|
||||
import nu.marginalia.index.model.ResultRankingContext;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.results.model.PhraseConstraintGroupList;
|
||||
import nu.marginalia.index.results.model.QuerySearchTerms;
|
||||
import nu.marginalia.index.model.SearchContext;
|
||||
import nu.marginalia.language.sentence.tag.HtmlTag;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
@@ -40,12 +39,12 @@ public class IndexResultScoreCalculator {
|
||||
private final QueryParams queryParams;
|
||||
|
||||
private final DomainRankingOverrides domainRankingOverrides;
|
||||
private final ResultRankingContext rankingContext;
|
||||
private final SearchContext rankingContext;
|
||||
private final CompiledQuery<String> compiledQuery;
|
||||
|
||||
public IndexResultScoreCalculator(StatefulIndex statefulIndex,
|
||||
DomainRankingOverrides domainRankingOverrides,
|
||||
ResultRankingContext rankingContext)
|
||||
SearchContext rankingContext)
|
||||
{
|
||||
this.index = statefulIndex.get();
|
||||
this.domainRankingOverrides = domainRankingOverrides;
|
||||
@@ -58,7 +57,7 @@ public class IndexResultScoreCalculator {
|
||||
@Nullable
|
||||
public SearchResultItem calculateScore(@Nullable DebugRankingFactors debugRankingFactors,
|
||||
long combinedId,
|
||||
QuerySearchTerms searchTerms,
|
||||
SearchContext rankingContext,
|
||||
long[] wordFlags,
|
||||
CodedSequence[] positions,
|
||||
DocumentSpans spans)
|
||||
@@ -106,23 +105,23 @@ public class IndexResultScoreCalculator {
|
||||
}
|
||||
}
|
||||
|
||||
var params = rankingContext.params;
|
||||
var params = this.rankingContext.params;
|
||||
|
||||
double documentBonus = calculateDocumentBonus(docMetadata, htmlFeatures, docSize, params, debugRankingFactors);
|
||||
|
||||
VerbatimMatches verbatimMatches = new VerbatimMatches(decodedPositions, searchTerms.phraseConstraints, spans);
|
||||
UnorderedMatches unorderedMatches = new UnorderedMatches(decodedPositions, compiledQuery, rankingContext.regularMask, spans);
|
||||
VerbatimMatches verbatimMatches = new VerbatimMatches(decodedPositions, rankingContext.phraseConstraints, spans);
|
||||
UnorderedMatches unorderedMatches = new UnorderedMatches(decodedPositions, compiledQuery, this.rankingContext.regularMask, spans);
|
||||
|
||||
float proximitiyFac = getProximitiyFac(decodedPositions, searchTerms.phraseConstraints, verbatimMatches, unorderedMatches, spans);
|
||||
float proximitiyFac = getProximitiyFac(decodedPositions, rankingContext.phraseConstraints, verbatimMatches, unorderedMatches, spans);
|
||||
|
||||
double score_firstPosition = params.getTcfFirstPositionWeight() * (1.0 / Math.sqrt(unorderedMatches.firstPosition));
|
||||
double score_verbatim = params.getTcfVerbatimWeight() * verbatimMatches.getScore();
|
||||
double score_proximity = params.getTcfProximityWeight() * proximitiyFac;
|
||||
double score_bM25 = params.getBm25Weight()
|
||||
* wordFlagsQuery.root.visit(new Bm25GraphVisitor(params.getBm25K(), params.getBm25B(), unorderedMatches.getWeightedCounts(), docSize, rankingContext))
|
||||
* wordFlagsQuery.root.visit(new Bm25GraphVisitor(params.getBm25K(), params.getBm25B(), unorderedMatches.getWeightedCounts(), docSize, this.rankingContext))
|
||||
/ (Math.sqrt(unorderedMatches.searchableKeywordCount + 1));
|
||||
double score_bFlags = params.getBm25Weight()
|
||||
* wordFlagsQuery.root.visit(new TermFlagsGraphVisitor(params.getBm25K(), wordFlagsQuery.data, unorderedMatches.getWeightedCounts(), rankingContext))
|
||||
* wordFlagsQuery.root.visit(new TermFlagsGraphVisitor(params.getBm25K(), wordFlagsQuery.data, unorderedMatches.getWeightedCounts(), this.rankingContext))
|
||||
/ (Math.sqrt(unorderedMatches.searchableKeywordCount + 1));
|
||||
|
||||
double rankingAdjustment = domainRankingOverrides.getRankingFactor(UrlIdCodec.getDomainId(combinedId));
|
||||
@@ -147,8 +146,8 @@ public class IndexResultScoreCalculator {
|
||||
debugRankingFactors.addDocumentFactor("score.proximity", Double.toString(score_proximity));
|
||||
debugRankingFactors.addDocumentFactor("score.firstPosition", Double.toString(score_firstPosition));
|
||||
|
||||
for (int i = 0; i < searchTerms.termIdsAll.size(); i++) {
|
||||
long termId = searchTerms.termIdsAll.at(i);
|
||||
for (int i = 0; i < rankingContext.termIdsAll.size(); i++) {
|
||||
long termId = rankingContext.termIdsAll.at(i);
|
||||
|
||||
var flags = wordFlagsQuery.at(i);
|
||||
|
||||
@@ -183,7 +182,7 @@ public class IndexResultScoreCalculator {
|
||||
docMetadata,
|
||||
htmlFeatures,
|
||||
score,
|
||||
calculatePositionsMask(decodedPositions, searchTerms.phraseConstraints)
|
||||
calculatePositionsMask(decodedPositions, rankingContext.phraseConstraints)
|
||||
);
|
||||
}
|
||||
|
||||
|
@@ -3,7 +3,7 @@ package nu.marginalia.index.results;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
|
||||
import nu.marginalia.index.model.ResultRankingContext;
|
||||
import nu.marginalia.index.model.SearchContext;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
|
||||
import java.util.List;
|
||||
@@ -20,7 +20,7 @@ public class TermFlagsGraphVisitor implements CqExpression.DoubleVisitor {
|
||||
public TermFlagsGraphVisitor(double k1,
|
||||
CqDataLong wordMetaData,
|
||||
float[] counts,
|
||||
ResultRankingContext ctx) {
|
||||
SearchContext ctx) {
|
||||
this.k1 = k1;
|
||||
this.counts = counts;
|
||||
this.docCount = ctx.termFreqDocCount();
|
||||
|
@@ -1,23 +0,0 @@
|
||||
package nu.marginalia.index.results.model;
|
||||
|
||||
import gnu.trove.map.hash.TObjectLongHashMap;
|
||||
import nu.marginalia.index.results.model.ids.TermIdList;
|
||||
|
||||
public class QuerySearchTerms {
|
||||
private final TObjectLongHashMap<String> termToId;
|
||||
public final TermIdList termIdsAll;
|
||||
|
||||
public final PhraseConstraintGroupList phraseConstraints;
|
||||
|
||||
public QuerySearchTerms(TObjectLongHashMap<String> termToId,
|
||||
TermIdList termIdsAll,
|
||||
PhraseConstraintGroupList phraseConstraints) {
|
||||
this.termToId = termToId;
|
||||
this.termIdsAll = termIdsAll;
|
||||
this.phraseConstraints = phraseConstraints;
|
||||
}
|
||||
|
||||
public long getIdForTerm(String searchTerm) {
|
||||
return termToId.get(searchTerm);
|
||||
}
|
||||
}
|
@@ -1,66 +0,0 @@
|
||||
package nu.marginalia.index.results.model;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.Long2ObjectArrayMap;
|
||||
import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap;
|
||||
import nu.marginalia.index.positions.TermData;
|
||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||
import nu.marginalia.index.results.model.ids.TermMetadataList;
|
||||
import nu.marginalia.sequence.CodedSequence;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
|
||||
public class TermMetadataForCombinedDocumentIds {
|
||||
private final Long2ObjectArrayMap<DocumentsWithMetadata> termdocToMeta;
|
||||
|
||||
public TermMetadataForCombinedDocumentIds(Long2ObjectArrayMap<DocumentsWithMetadata> termdocToMeta) {
|
||||
this.termdocToMeta = termdocToMeta;
|
||||
}
|
||||
|
||||
public byte getTermMetadata(long termId, long combinedId) {
|
||||
var metaByCombinedId = termdocToMeta.get(termId);
|
||||
if (metaByCombinedId == null) {
|
||||
return 0;
|
||||
}
|
||||
return metaByCombinedId.get(combinedId).flags();
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public CodedSequence getPositions(long termId, long combinedId) {
|
||||
var metaByCombinedId = termdocToMeta.get(termId);
|
||||
|
||||
if (metaByCombinedId == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return metaByCombinedId.get(combinedId).positions();
|
||||
}
|
||||
|
||||
public boolean hasTermMeta(long termId, long combinedId) {
|
||||
var metaByCombinedId = termdocToMeta.get(termId);
|
||||
|
||||
if (metaByCombinedId == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return metaByCombinedId.data().containsKey(combinedId);
|
||||
}
|
||||
|
||||
public record DocumentsWithMetadata(Long2ObjectOpenHashMap<TermData> data) {
|
||||
public DocumentsWithMetadata(CombinedDocIdList combinedDocIdsAll, TermMetadataList metadata) {
|
||||
this(new Long2ObjectOpenHashMap<>(combinedDocIdsAll.size()));
|
||||
|
||||
long[] ids = combinedDocIdsAll.array();
|
||||
TermData[] data = metadata.array();
|
||||
|
||||
for (int i = 0; i < combinedDocIdsAll.size(); i++) {
|
||||
if (data[i] != null) {
|
||||
this.data.put(ids[i], data[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public TermData get(long combinedId) {
|
||||
return data.get(combinedId);
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,35 @@
|
||||
package nu.marginalia.index.reverse;
|
||||
|
||||
import nu.marginalia.array.page.LongQueryBuffer;
|
||||
import nu.marginalia.index.reverse.query.EntrySource;
|
||||
import nu.marginalia.skiplist.SkipListReader;
|
||||
|
||||
public class FullIndexEntrySource implements EntrySource {
|
||||
private final String name;
|
||||
|
||||
private final SkipListReader reader;
|
||||
private final long wordId;
|
||||
|
||||
public FullIndexEntrySource(String name,
|
||||
SkipListReader reader,
|
||||
long wordId) {
|
||||
this.name = name;
|
||||
this.reader = reader;
|
||||
this.wordId = wordId;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void read(LongQueryBuffer buffer) {
|
||||
reader.getKeys(buffer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasMore() {
|
||||
return !reader.atEnd();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String indexName() {
|
||||
return name + ":" + Long.toHexString(wordId);
|
||||
}
|
||||
}
|
@@ -0,0 +1,227 @@
|
||||
package nu.marginalia.index.reverse;
|
||||
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.array.pool.BufferPool;
|
||||
import nu.marginalia.ffi.LinuxSystemCalls;
|
||||
import nu.marginalia.index.model.CombinedDocIdList;
|
||||
import nu.marginalia.index.model.TermMetadataList;
|
||||
import nu.marginalia.index.reverse.positions.PositionsFileReader;
|
||||
import nu.marginalia.index.reverse.positions.TermData;
|
||||
import nu.marginalia.index.reverse.query.*;
|
||||
import nu.marginalia.index.reverse.query.filter.QueryFilterLetThrough;
|
||||
import nu.marginalia.index.reverse.query.filter.QueryFilterNoPass;
|
||||
import nu.marginalia.index.reverse.query.filter.QueryFilterStepIf;
|
||||
import nu.marginalia.skiplist.SkipListConstants;
|
||||
import nu.marginalia.skiplist.SkipListReader;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.io.IOException;
|
||||
import java.lang.foreign.Arena;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class FullReverseIndexReader {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private final Map<String, WordLexicon> wordLexiconMap;
|
||||
|
||||
private final LongArray documents;
|
||||
private final PositionsFileReader positionsFileReader;
|
||||
private final BufferPool dataPool;
|
||||
private final String name;
|
||||
|
||||
public FullReverseIndexReader(String name,
|
||||
Collection<WordLexicon> wordLexicons,
|
||||
Path documents,
|
||||
Path positionsFile)
|
||||
throws IOException
|
||||
{
|
||||
this.name = name;
|
||||
|
||||
if (!Files.exists(documents)) {
|
||||
this.documents = null;
|
||||
this.dataPool = null;
|
||||
this.positionsFileReader = null;
|
||||
this.wordLexiconMap = Map.of();
|
||||
|
||||
wordLexicons.forEach(WordLexicon::close);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
this.wordLexiconMap = wordLexicons.stream().collect(Collectors.toUnmodifiableMap(lexicon -> lexicon.languageIsoCode, v->v));
|
||||
this.positionsFileReader = new PositionsFileReader(positionsFile);
|
||||
|
||||
logger.info("Switching reverse index");
|
||||
|
||||
this.documents = LongArrayFactory.mmapForReadingShared(documents);
|
||||
|
||||
LinuxSystemCalls.madviseRandom(this.documents.getMemorySegment());
|
||||
|
||||
dataPool = new BufferPool(documents, SkipListConstants.BLOCK_SIZE,
|
||||
(int) (Long.getLong("index.bufferPoolSize", 512*1024*1024L) / SkipListConstants.BLOCK_SIZE)
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
dataPool.reset();
|
||||
}
|
||||
|
||||
|
||||
public EntrySource documents(IndexLanguageContext languageContext, long termId) {
|
||||
if (null == languageContext.wordLexiconFull) {
|
||||
logger.warn("Reverse index is not ready, dropping query");
|
||||
return new EmptyEntrySource();
|
||||
}
|
||||
|
||||
long offset = languageContext.wordLexiconFull.wordOffset(termId);
|
||||
|
||||
if (offset < 0) // No documents
|
||||
return new EmptyEntrySource();
|
||||
|
||||
return new FullIndexEntrySource(name, getReader(offset), termId);
|
||||
}
|
||||
|
||||
/** Create a filter step requiring the specified termId to exist in the documents */
|
||||
public QueryFilterStepIf also(IndexLanguageContext languageContext, long termId, IndexSearchBudget budget) {
|
||||
var lexicon = languageContext.wordLexiconFull;
|
||||
if (null == lexicon)
|
||||
return new QueryFilterNoPass();
|
||||
|
||||
long offset = lexicon.wordOffset(termId);
|
||||
if (offset < 0) // No documents
|
||||
return new QueryFilterNoPass();
|
||||
|
||||
return new ReverseIndexRetainFilter(getReader(offset), name, termId, budget);
|
||||
}
|
||||
|
||||
/** Create a filter step requiring the specified termId to be absent from the documents */
|
||||
    public QueryFilterStepIf not(IndexLanguageContext languageContext, long termId, IndexSearchBudget budget) {
        var lexicon = languageContext.wordLexiconFull;
        if (null == lexicon)
            return new QueryFilterLetThrough();

        long offset = lexicon.wordOffset(termId);

        if (offset < 0) // No documents
            return new QueryFilterLetThrough();

        return new ReverseIndexRejectFilter(getReader(offset), budget);
    }

    /** Return the number of documents with the termId in the index */
    public int numDocuments(IndexLanguageContext languageContext, long termId) {
        var lexicon = languageContext.wordLexiconFull;
        if (null == lexicon)
            return 0;

        long offset = lexicon.wordOffset(termId);

        if (offset < 0)
            return 0;

        return getReader(offset).estimateSize();
    }

    /** Create a SkipListReader for the document offset associated with a termId */
    private SkipListReader getReader(long offset) {
        return new SkipListReader(dataPool, offset);
    }

    /** Get term metadata for each document, return an array of TermMetadataList of the same
     * length and order as termIds, with each list of the same length and order as docIds
     *
     * @throws TimeoutException if the read could not be queued in a timely manner;
     * (the read itself may still exceed the budgeted time)
     */
    public TermMetadataList[] getTermData(Arena arena,
                                          IndexLanguageContext languageContext,
                                          IndexSearchBudget budget,
                                          long[] termIds,
                                          CombinedDocIdList docIds)
            throws TimeoutException
    {
        // Gather all termdata to be retrieved into a single array,
        // to help cluster related disk accesses and get better I/O performance

        WordLexicon lexicon = languageContext.wordLexiconFull;
        if (null == lexicon) {
            TermMetadataList[] ret = new TermMetadataList[termIds.length];
            for (int i = 0; i < termIds.length; i++) {
                ret[i] = new TermMetadataList(new TermData[docIds.size()]);
            }
            return ret;
        }

        long[] offsetsAll = new long[termIds.length * docIds.size()];
        for (int i = 0; i < termIds.length; i++) {
            long termId = termIds[i];
            long offset = lexicon.wordOffset(termId);

            if (offset < 0) {
                // This is likely a bug in the code, but we can't throw an exception here.
                logger.debug("Missing offset for word {}", termId);

                // We'll pass zero offsets to positionsFileReader.getTermData(), which will be
                // interpreted as an instruction to ignore these positions.
                continue;
            }

            // Read the size and offset of the position data
            long[] offsetsForTerm = getReader(offset).getValueOffsets(docIds.array());

            // Add to the big array of term data offsets
            System.arraycopy(offsetsForTerm, 0, offsetsAll, i * docIds.size(), docIds.size());
        }

        // Perform the read
        TermData[] termDataCombined = positionsFileReader.getTermData(arena, budget, offsetsAll);

        // Break the result data into separate arrays by termId again
        TermMetadataList[] ret = new TermMetadataList[termIds.length];
        for (int i = 0; i < termIds.length; i++) {
            ret[i] = new TermMetadataList(
                    Arrays.copyOfRange(termDataCombined, i*docIds.size(), (i+1)*docIds.size())
            );
        }

        return ret;
    }

    public void close() {
        try {
            dataPool.close();
        }
        catch (Exception e) {
            logger.warn("Error while closing bufferPool", e);
        }

        if (documents != null)
            documents.close();

        wordLexiconMap.values().forEach(WordLexicon::close);

        if (positionsFileReader != null) {
            try {
                positionsFileReader.close();
            } catch (IOException e) {
                logger.error("Failed to close positions file reader", e);
            }
        }
    }

    @Nullable
    public WordLexicon getWordLexicon(String languageIsoCode) {
        return wordLexiconMap.get(languageIsoCode);
    }
}
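The comment at the top of getTermData() explains the point of flattening everything into one array: clustering the disk reads. The layout is row-major, so the slot for term i and document j is i * docIds.size() + j, which is also the contiguous region Arrays.copyOfRange() peels back off per term at the end. A small self-contained sketch of that indexing, with toy sizes that are assumptions rather than anything from the diff:

import java.util.Arrays;

class OffsetLayoutSketch {
    public static void main(String[] args) {
        int terms = 3, docs = 4;                  // assumed toy sizes
        long[] offsetsAll = new long[terms * docs];

        offsetsAll[2 * docs + 1] = 12345L;        // offset for term #2, document #1

        // The same region that Arrays.copyOfRange() extracts per term afterwards:
        long[] rowForTerm2 = Arrays.copyOfRange(offsetsAll, 2 * docs, 3 * docs);
        System.out.println(rowForTerm2[1]);       // prints 12345
    }
}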
@@ -0,0 +1,19 @@
package nu.marginalia.index.reverse;

import javax.annotation.Nullable;

public class IndexLanguageContext {
    public final String languageIsoCode;

    @Nullable
    final WordLexicon wordLexiconFull;

    @Nullable
    final WordLexicon wordLexiconPrio;

    public IndexLanguageContext(String languageIsoCode, WordLexicon wordLexiconFull, WordLexicon wordLexiconPrio) {
        this.languageIsoCode = languageIsoCode;
        this.wordLexiconFull = wordLexiconFull;
        this.wordLexiconPrio = wordLexiconPrio;
    }
}
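For orientation, a minimal sketch of how a language context might be assembled; the helper name and the "en" default are illustrative assumptions, not part of the diff. Each reader exposes getWordLexicon(String), and both treat a null lexicon in the context as "language not indexed" rather than as an error:

import javax.annotation.Nullable;

// Hypothetical helper, assumed to live in nu.marginalia.index.reverse.
class LanguageContexts {
    static IndexLanguageContext forLanguage(String languageIsoCode,
                                            @Nullable WordLexicon fullLexicon,
                                            @Nullable WordLexicon prioLexicon) {
        // A null lexicon is legal: the readers answer such contexts with
        // zero documents / empty entry sources instead of throwing.
        return new IndexLanguageContext(languageIsoCode, fullLexicon, prioLexicon);
    }
}

The lexicons themselves would typically come from the readers' getWordLexicon(languageIsoCode) methods, and the same context is then passed to calls such as numDocuments(ctx, termId) on either reader.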
@@ -1,7 +1,7 @@
package nu.marginalia.index;
package nu.marginalia.index.reverse;

import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.query.EntrySource;
import nu.marginalia.index.reverse.query.EntrySource;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.sequence.io.BitReader;

@@ -13,7 +13,7 @@ import java.nio.channels.FileChannel;
public class PrioIndexEntrySource implements EntrySource {
    private final String name;

    private final ByteBuffer readData = ByteBuffer.allocate(1024);
    private final ByteBuffer readData = ByteBuffer.allocate(8*1024);
    private final BitReader bitReader = new BitReader(readData, this::fillReadBuffer);

    private final FileChannel docsFileChannel;

@@ -55,17 +55,13 @@ public class PrioIndexEntrySource implements EntrySource {
        }
    }

    @Override
    public void skip(int n) {
        throw new UnsupportedOperationException("Not implemented");
    }

    @Override
    @SuppressWarnings("preview")
    public void read(LongQueryBuffer buffer) {
        var outputBuffer = buffer.asByteBuffer().order(ByteOrder.LITTLE_ENDIAN);
        outputBuffer.clear();

        // FYI: The encoding end of this compression algorithm is at PrioDocIdsTransformer
        while (outputBuffer.hasRemaining() && readItems++ < numItems) {
            int rank;
            int domainId;
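The two allocate lines suggest the read buffer grows from 1 KiB to 8 KiB; it is refilled on demand through the this::fillReadBuffer callback handed to BitReader. A generic sketch of that fixed-buffer-plus-refill-callback pattern follows (plain java.nio; the callback's signature and body are assumptions for illustration, not the actual BitReader contract):

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;

// Generic illustration: a fixed read buffer that is topped up from the docs
// file channel whenever the bit-level consumer runs dry.
class RefillSketch {
    private final ByteBuffer readData = ByteBuffer.allocate(8 * 1024);
    private final FileChannel docsFileChannel;

    RefillSketch(FileChannel docsFileChannel) {
        this.docsFileChannel = docsFileChannel;
        readData.flip(); // start "empty" so the first access triggers a refill
    }

    // Shape of a refill callback: keep the unread tail, read more bytes after it,
    // then flip back into reading mode.
    private void fillReadBuffer() {
        try {
            readData.compact();
            docsFileChannel.read(readData);
            readData.flip();
        }
        catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }
}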
@@ -1,71 +1,53 @@
package nu.marginalia.index;
package nu.marginalia.index.reverse;

import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.query.EmptyEntrySource;
import nu.marginalia.index.query.EntrySource;
import nu.marginalia.index.reverse.query.EmptyEntrySource;
import nu.marginalia.index.reverse.query.EntrySource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class PrioReverseIndexReader {
    private final LongArray words;
    private final long wordsDataOffset;
    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final BTreeReader wordsBTreeReader;
    private final String name;

    private final Map<String, WordLexicon> wordLexiconMap;

    private final FileChannel documentsChannel;

    public PrioReverseIndexReader(String name,
                                  Path words,
                                  List<WordLexicon> wordLexicons,
                                  Path documents) throws IOException {
        this.name = name;

        if (!Files.exists(words) || !Files.exists(documents)) {
            this.words = null;
            this.wordsBTreeReader = null;
        if (!Files.exists(documents)) {
            this.documentsChannel = null;
            this.wordsDataOffset = -1;
            this.wordLexiconMap = Map.of();
            return;
        }

        logger.info("Switching reverse index");

        this.words = LongArrayFactory.mmapForReadingShared(words);

        wordsBTreeReader = new BTreeReader(this.words, ReverseIndexParameters.wordsBTreeContext, 0);
        wordsDataOffset = wordsBTreeReader.getHeader().dataOffsetLongs();

        wordLexiconMap = wordLexicons.stream().collect(Collectors.toUnmodifiableMap(lexicon -> lexicon.languageIsoCode, v -> v));
        documentsChannel = (FileChannel) Files.newByteChannel(documents);

        logger.info("Switching reverse index");
    }

    /** Calculate the offset of the word in the documents.
     * If the return-value is negative, the term does not exist
     * in the index.
     */
    long wordOffset(long termId) {
        long idx = wordsBTreeReader.findEntry(termId);

        if (idx < 0)
            return -1L;

        return words.get(wordsDataOffset + idx + 1);
    }

    public EntrySource documents(long termId) {
        if (null == words) {
    public EntrySource documents(IndexLanguageContext languageContext, long termId) {
        if (languageContext.wordLexiconPrio == null) {
            logger.warn("Reverse index is not ready, dropping query");
            return new EmptyEntrySource();
        }

        long offset = wordOffset(termId);
        long offset = languageContext.wordLexiconPrio.wordOffset(termId);

        if (offset < 0) // No documents
            return new EmptyEntrySource();

@@ -76,10 +58,16 @@ public class PrioReverseIndexReader {
            termId);
    }

    /** Return the number of documents with the termId in the index */
    public int numDocuments(long termId) {
    /**
     * Return the number of documents with the termId in the index
     */
    public int numDocuments(IndexLanguageContext languageContext, long termId) {

        long offset = wordOffset(termId);
        var lexicon = languageContext.wordLexiconPrio;
        if (null == lexicon)
            return 0;

        long offset = lexicon.wordOffset(termId);

        if (offset < 0) // No documents
            return 0;

@@ -87,8 +75,7 @@ public class PrioReverseIndexReader {
        ByteBuffer buffer = ByteBuffer.allocate(4);
        try {
            documentsChannel.read(buffer, offset);
        }
        catch (IOException e) {
        } catch (IOException e) {
            logger.error("Failed to read documents channel", e);
            return 0;
        }

@@ -101,13 +88,15 @@ public class PrioReverseIndexReader {
    public void close() {
        try {
            documentsChannel.close();
        }
        catch (IOException e) {
        } catch (IOException e) {
            logger.error("Failed to close documents channel", e);
        }

        if (words != null)
            words.close();
        wordLexiconMap.values().forEach(WordLexicon::close);
    }

}
    @Nullable
    public WordLexicon getWordLexicon(String languageIsoCode) {
        return wordLexiconMap.get(languageIsoCode);
    }
}
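numDocuments() resolves the term's offset through the prio lexicon and then reads a 4-byte count at that offset in the documents channel; the hunk above ends before the return statement. A generic, hedged sketch of that positional-read step (the helper name and the retry loop are assumptions, not the actual method body):

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;

// Generic sketch of a positional 4-byte read from a file channel.
class PositionalReadSketch {
    static int readIntAt(FileChannel channel, long offset) throws IOException {
        ByteBuffer buffer = ByteBuffer.allocate(4);
        while (buffer.hasRemaining()) {
            if (channel.read(buffer, offset + buffer.position()) < 0)
                return 0;   // unexpected end of file
        }
        return buffer.getInt(0);
    }
}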
code/index/java/nu/marginalia/index/reverse/WordLexicon.java (new file, 46 lines)
@@ -0,0 +1,46 @@
package nu.marginalia.index.reverse;

import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.ffi.LinuxSystemCalls;
import nu.marginalia.index.config.ReverseIndexParameters;

import java.io.IOException;
import java.nio.file.Path;

public class WordLexicon {
    public final String languageIsoCode;
    private final LongArray words;
    private final BTreeReader wordsBTreeReader;
    private final long wordsDataOffset;

    public WordLexicon(String languageIsoCode, Path fileName) throws IOException {
        this.languageIsoCode = languageIsoCode;

        this.words = LongArrayFactory.mmapForReadingShared(fileName);

        LinuxSystemCalls.madviseRandom(this.words.getMemorySegment());

        this.wordsBTreeReader = new BTreeReader(this.words, ReverseIndexParameters.wordsBTreeContext, 0);
        this.wordsDataOffset = wordsBTreeReader.getHeader().dataOffsetLongs();
    }

    /** Calculate the offset of the word in the documents.
     * If the return-value is negative, the term does not exist
     * in the index.
     */
    public long wordOffset(long termId) {
        long idx = wordsBTreeReader.findEntry(termId);

        if (idx < 0)
            return -1L;

        return words.get(wordsDataOffset + idx + 1);
    }

    public void close() {
        words.close();
    }
}
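WordLexicon wraps one language's mmapped words B-tree; madviseRandom() hints the kernel that access will be random point lookups, which suits B-tree searches better than readahead. A minimal usage sketch, assuming calling code in the same package; the file path and term id are illustrative only:

import java.io.IOException;
import java.nio.file.Path;

// Illustrative only: open a per-language lexicon and resolve a term's offset
// in the documents file.
class WordLexiconSketch {
    public static void main(String[] args) throws IOException {
        WordLexicon lexicon = new WordLexicon("en", Path.of("rev-index-words-en.dat"));
        try {
            long termId = 0xDEADBEEFL;            // hypothetical term hash
            long offset = lexicon.wordOffset(termId);
            if (offset < 0) {
                System.out.println("term not present in this language's index");
            } else {
                System.out.println("documents for term start at offset " + offset);
            }
        }
        finally {
            lexicon.close();
        }
    }
}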
@@ -1,4 +1,4 @@
package nu.marginalia.index.construction;
package nu.marginalia.index.reverse.construction;

import nu.marginalia.array.algo.LongArrayTransformations;
Some files were not shown because too many files have changed in this diff.