mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
(index) Make hash algorithm selection configurable, writer-side
This commit is contained in:
@@ -4,6 +4,7 @@ import nu.marginalia.index.domainrankings.DomainRankings;
|
||||
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
|
||||
import nu.marginalia.index.journal.IndexJournal;
|
||||
import nu.marginalia.index.journal.IndexJournalSlopWriter;
|
||||
import nu.marginalia.language.keywords.KeywordHasher;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import nu.marginalia.model.processed.SlopDocumentRecord;
|
||||
import nu.marginalia.process.control.FakeProcessHeartbeat;
|
||||
@@ -87,8 +88,8 @@ class ForwardIndexConverterTest {
|
||||
List.of(),
|
||||
new byte[0],
|
||||
List.of()
|
||||
)
|
||||
);
|
||||
),
|
||||
new KeywordHasher.AsciiIsh());
|
||||
|
||||
|
||||
}
|
||||
|
@@ -14,6 +14,7 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation project(':code:libraries:coded-sequence')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
implementation project(':code:libraries:array')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:processes:converting-process:model')
|
||||
|
@@ -1,6 +1,6 @@
|
||||
package nu.marginalia.index.journal;
|
||||
|
||||
import nu.marginalia.hash.MurmurHash3_128;
|
||||
import nu.marginalia.language.keywords.KeywordHasher;
|
||||
import nu.marginalia.model.processed.SlopDocumentRecord;
|
||||
import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn;
|
||||
import nu.marginalia.slop.SlopTable;
|
||||
@@ -28,8 +28,6 @@ public class IndexJournalSlopWriter extends SlopTable {
|
||||
private final VarintCodedSequenceArrayColumn.Writer spansWriter;
|
||||
private final ByteArrayColumn.Writer spanCodesWriter;
|
||||
|
||||
private static final MurmurHash3_128 hash = new MurmurHash3_128();
|
||||
|
||||
public IndexJournalSlopWriter(Path dir, int page) throws IOException {
|
||||
|
||||
super(dir, page);
|
||||
@@ -52,7 +50,7 @@ public class IndexJournalSlopWriter extends SlopTable {
|
||||
spansWriter = IndexJournalPage.spans.create(this);
|
||||
}
|
||||
|
||||
public void put(long combinedId, SlopDocumentRecord.KeywordsProjection keywordsProjection) throws IOException {
|
||||
public void put(long combinedId, SlopDocumentRecord.KeywordsProjection keywordsProjection, KeywordHasher hasher) throws IOException {
|
||||
|
||||
combinedIdWriter.put(combinedId);
|
||||
featuresWriter.put(keywordsProjection.htmlFeatures());
|
||||
@@ -66,7 +64,7 @@ public class IndexJournalSlopWriter extends SlopTable {
|
||||
// termIds are the special hashes of the keywords
|
||||
long[] termIds = new long[keywordsProjection.words().size()];
|
||||
for (int i = 0; i < termIds.length; i++) {
|
||||
termIds[i] = hash.hashKeyword(keywords.get(i));
|
||||
termIds[i] = hasher.hashKeyword(keywords.get(i));
|
||||
}
|
||||
|
||||
termIdsWriter.put(termIds);
|
||||
|
@@ -39,5 +39,6 @@ dependencies {
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
testImplementation project(':code:libraries:test-helpers')
|
||||
testImplementation project(':code:libraries:language-processing')
|
||||
}
|
||||
|
||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.index.construction.full;
|
||||
|
||||
import nu.marginalia.index.journal.IndexJournalPage;
|
||||
import nu.marginalia.index.journal.IndexJournalSlopWriter;
|
||||
import nu.marginalia.language.keywords.KeywordHasher;
|
||||
import nu.marginalia.model.processed.SlopDocumentRecord;
|
||||
import nu.marginalia.sequence.VarintCodedSequence;
|
||||
import nu.marginalia.test.TestUtil;
|
||||
@@ -84,8 +85,8 @@ public class TestJournalFactory {
|
||||
Arrays.asList(positions),
|
||||
new byte[0],
|
||||
List.of()
|
||||
)
|
||||
);
|
||||
),
|
||||
new KeywordHasher.AsciiIsh());
|
||||
}
|
||||
writer.close();
|
||||
|
||||
@@ -121,8 +122,8 @@ public class TestJournalFactory {
|
||||
Arrays.asList(positions),
|
||||
new byte[0],
|
||||
List.of()
|
||||
)
|
||||
);
|
||||
),
|
||||
new KeywordHasher.AsciiIsh());
|
||||
|
||||
}
|
||||
writer.close();
|
||||
|
@@ -18,6 +18,7 @@ import nu.marginalia.index.journal.IndexJournalSlopWriter;
|
||||
import nu.marginalia.index.positions.TermData;
|
||||
import nu.marginalia.index.query.IndexSearchBudget;
|
||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||
import nu.marginalia.language.keywords.KeywordHasher;
|
||||
import nu.marginalia.linkdb.docs.DocumentDbReader;
|
||||
import nu.marginalia.linkdb.docs.DocumentDbWriter;
|
||||
import nu.marginalia.linkdb.model.DocdbUrlDetail;
|
||||
@@ -339,7 +340,7 @@ public class CombinedIndexReaderTest {
|
||||
positions,
|
||||
new byte[0],
|
||||
List.of()
|
||||
));
|
||||
), new KeywordHasher.AsciiIsh());
|
||||
}
|
||||
|
||||
var linkdbWriter = new DocumentDbWriter(
|
||||
|
@@ -20,6 +20,7 @@ import nu.marginalia.index.journal.IndexJournal;
|
||||
import nu.marginalia.index.journal.IndexJournalSlopWriter;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.language.keywords.KeywordHasher;
|
||||
import nu.marginalia.linkdb.docs.DocumentDbReader;
|
||||
import nu.marginalia.linkdb.docs.DocumentDbWriter;
|
||||
import nu.marginalia.linkdb.model.DocdbUrlDetail;
|
||||
@@ -444,7 +445,7 @@ public class IndexQueryServiceIntegrationSmokeTest {
|
||||
positions,
|
||||
new byte[0],
|
||||
List.of()
|
||||
));
|
||||
), new KeywordHasher.AsciiIsh());
|
||||
|
||||
}
|
||||
|
||||
@@ -484,7 +485,7 @@ public class IndexQueryServiceIntegrationSmokeTest {
|
||||
positions,
|
||||
new byte[0],
|
||||
List.of()
|
||||
));
|
||||
), new KeywordHasher.AsciiIsh());
|
||||
|
||||
}
|
||||
|
||||
|
@@ -21,6 +21,7 @@ import nu.marginalia.index.journal.IndexJournal;
|
||||
import nu.marginalia.index.journal.IndexJournalSlopWriter;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.language.keywords.KeywordHasher;
|
||||
import nu.marginalia.linkdb.docs.DocumentDbReader;
|
||||
import nu.marginalia.linkdb.docs.DocumentDbWriter;
|
||||
import nu.marginalia.linkdb.model.DocdbUrlDetail;
|
||||
@@ -569,7 +570,7 @@ public class IndexQueryServiceIntegrationTest {
|
||||
positions,
|
||||
new byte[0],
|
||||
List.of()
|
||||
));
|
||||
), new KeywordHasher.AsciiIsh());
|
||||
}
|
||||
|
||||
var linkdbWriter = new DocumentDbWriter(
|
||||
|
@@ -5,6 +5,9 @@ import com.google.inject.Singleton;
|
||||
import nu.marginalia.IndexLocations;
|
||||
import nu.marginalia.index.journal.IndexJournal;
|
||||
import nu.marginalia.index.journal.IndexJournalSlopWriter;
|
||||
import nu.marginalia.language.config.LanguageConfiguration;
|
||||
import nu.marginalia.language.keywords.KeywordHasher;
|
||||
import nu.marginalia.language.model.LanguageDefinition;
|
||||
import nu.marginalia.model.processed.SlopDocumentRecord;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import org.slf4j.Logger;
|
||||
@@ -12,6 +15,8 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
@Singleton
|
||||
@@ -24,8 +29,15 @@ public class LoaderIndexJournalWriter {
|
||||
private long recordsWritten = 0;
|
||||
private int page;
|
||||
|
||||
private final Map<String, KeywordHasher> hasherByLanguage = new HashMap<>();
|
||||
|
||||
@Inject
|
||||
public LoaderIndexJournalWriter(FileStorageService fileStorageService) throws IOException {
|
||||
public LoaderIndexJournalWriter(FileStorageService fileStorageService, LanguageConfiguration languageConfiguration) throws IOException {
|
||||
|
||||
for (LanguageDefinition languageDefinition: languageConfiguration.languages()) {
|
||||
hasherByLanguage.put(languageDefinition.isoCode(), languageDefinition.keywordHasher());
|
||||
}
|
||||
|
||||
var indexArea = IndexLocations.getIndexConstructionArea(fileStorageService);
|
||||
|
||||
journalPath = IndexJournal.allocateName(indexArea);
|
||||
@@ -46,12 +58,15 @@ public class LoaderIndexJournalWriter {
|
||||
|
||||
public void putWords(long header, SlopDocumentRecord.KeywordsProjection data) throws IOException
|
||||
{
|
||||
KeywordHasher hasher = hasherByLanguage.get(data.language());
|
||||
if (null == hasher) return;
|
||||
|
||||
if (++recordsWritten > 200_000) {
|
||||
recordsWritten = 0;
|
||||
switchToNextVersion();
|
||||
}
|
||||
|
||||
currentWriter.put(header, data);
|
||||
currentWriter.put(header, data, hasher);
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
|
Reference in New Issue
Block a user