1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

(index) Make hash algorithm selection configurable, writer-side

This commit is contained in:
Viktor Lofgren
2025-09-01 12:03:01 +02:00
parent 42f043a60f
commit 946d64c8da
9 changed files with 37 additions and 17 deletions

View File

@@ -4,6 +4,7 @@ import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalSlopWriter;
import nu.marginalia.language.keywords.KeywordHasher;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.process.control.FakeProcessHeartbeat;
@@ -87,8 +88,8 @@ class ForwardIndexConverterTest {
List.of(),
new byte[0],
List.of()
)
);
),
new KeywordHasher.AsciiIsh());
}

View File

@@ -14,6 +14,7 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:array')
implementation project(':code:common:model')
implementation project(':code:processes:converting-process:model')

View File

@@ -1,6 +1,6 @@
package nu.marginalia.index.journal;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.language.keywords.KeywordHasher;
import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn;
import nu.marginalia.slop.SlopTable;
@@ -28,8 +28,6 @@ public class IndexJournalSlopWriter extends SlopTable {
private final VarintCodedSequenceArrayColumn.Writer spansWriter;
private final ByteArrayColumn.Writer spanCodesWriter;
private static final MurmurHash3_128 hash = new MurmurHash3_128();
public IndexJournalSlopWriter(Path dir, int page) throws IOException {
super(dir, page);
@@ -52,7 +50,7 @@ public class IndexJournalSlopWriter extends SlopTable {
spansWriter = IndexJournalPage.spans.create(this);
}
public void put(long combinedId, SlopDocumentRecord.KeywordsProjection keywordsProjection) throws IOException {
public void put(long combinedId, SlopDocumentRecord.KeywordsProjection keywordsProjection, KeywordHasher hasher) throws IOException {
combinedIdWriter.put(combinedId);
featuresWriter.put(keywordsProjection.htmlFeatures());
@@ -66,7 +64,7 @@ public class IndexJournalSlopWriter extends SlopTable {
// termIds are the special hashes of the keywords
long[] termIds = new long[keywordsProjection.words().size()];
for (int i = 0; i < termIds.length; i++) {
termIds[i] = hash.hashKeyword(keywords.get(i));
termIds[i] = hasher.hashKeyword(keywords.get(i));
}
termIdsWriter.put(termIds);

View File

@@ -39,5 +39,6 @@ dependencies {
testImplementation libs.bundles.junit
testImplementation libs.mockito
testImplementation project(':code:libraries:test-helpers')
testImplementation project(':code:libraries:language-processing')
}

View File

@@ -2,6 +2,7 @@ package nu.marginalia.index.construction.full;
import nu.marginalia.index.journal.IndexJournalPage;
import nu.marginalia.index.journal.IndexJournalSlopWriter;
import nu.marginalia.language.keywords.KeywordHasher;
import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.sequence.VarintCodedSequence;
import nu.marginalia.test.TestUtil;
@@ -84,8 +85,8 @@ public class TestJournalFactory {
Arrays.asList(positions),
new byte[0],
List.of()
)
);
),
new KeywordHasher.AsciiIsh());
}
writer.close();
@@ -121,8 +122,8 @@ public class TestJournalFactory {
Arrays.asList(positions),
new byte[0],
List.of()
)
);
),
new KeywordHasher.AsciiIsh());
}
writer.close();

View File

@@ -18,6 +18,7 @@ import nu.marginalia.index.journal.IndexJournalSlopWriter;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.language.keywords.KeywordHasher;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.linkdb.docs.DocumentDbWriter;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
@@ -339,7 +340,7 @@ public class CombinedIndexReaderTest {
positions,
new byte[0],
List.of()
));
), new KeywordHasher.AsciiIsh());
}
var linkdbWriter = new DocumentDbWriter(

View File

@@ -20,6 +20,7 @@ import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalSlopWriter;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.language.keywords.KeywordHasher;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.linkdb.docs.DocumentDbWriter;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
@@ -444,7 +445,7 @@ public class IndexQueryServiceIntegrationSmokeTest {
positions,
new byte[0],
List.of()
));
), new KeywordHasher.AsciiIsh());
}
@@ -484,7 +485,7 @@ public class IndexQueryServiceIntegrationSmokeTest {
positions,
new byte[0],
List.of()
));
), new KeywordHasher.AsciiIsh());
}

View File

@@ -21,6 +21,7 @@ import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalSlopWriter;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.language.keywords.KeywordHasher;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.linkdb.docs.DocumentDbWriter;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
@@ -569,7 +570,7 @@ public class IndexQueryServiceIntegrationTest {
positions,
new byte[0],
List.of()
));
), new KeywordHasher.AsciiIsh());
}
var linkdbWriter = new DocumentDbWriter(

View File

@@ -5,6 +5,9 @@ import com.google.inject.Singleton;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalSlopWriter;
import nu.marginalia.language.config.LanguageConfiguration;
import nu.marginalia.language.keywords.KeywordHasher;
import nu.marginalia.language.model.LanguageDefinition;
import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.storage.FileStorageService;
import org.slf4j.Logger;
@@ -12,6 +15,8 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;
@Singleton
@@ -24,8 +29,15 @@ public class LoaderIndexJournalWriter {
private long recordsWritten = 0;
private int page;
private final Map<String, KeywordHasher> hasherByLanguage = new HashMap<>();
@Inject
public LoaderIndexJournalWriter(FileStorageService fileStorageService) throws IOException {
public LoaderIndexJournalWriter(FileStorageService fileStorageService, LanguageConfiguration languageConfiguration) throws IOException {
for (LanguageDefinition languageDefinition: languageConfiguration.languages()) {
hasherByLanguage.put(languageDefinition.isoCode(), languageDefinition.keywordHasher());
}
var indexArea = IndexLocations.getIndexConstructionArea(fileStorageService);
journalPath = IndexJournal.allocateName(indexArea);
@@ -46,12 +58,15 @@ public class LoaderIndexJournalWriter {
public void putWords(long header, SlopDocumentRecord.KeywordsProjection data) throws IOException
{
KeywordHasher hasher = hasherByLanguage.get(data.language());
if (null == hasher) return;
if (++recordsWritten > 200_000) {
recordsWritten = 0;
switchToNextVersion();
}
currentWriter.put(header, data);
currentWriter.put(header, data, hasher);
}
public void close() throws IOException {