1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

(refac) Clean up index code

This commit is contained in:
Viktor Lofgren
2025-09-02 17:44:42 +02:00
parent fd1ac03c78
commit 52194cbe7a
4 changed files with 2 additions and 164 deletions

View File

@@ -29,7 +29,6 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.concurrent.TimeUnit;
public class IndexConstructorMain extends ProcessMainClass {
@@ -39,7 +38,7 @@ public class IndexConstructorMain extends ProcessMainClass {
private static final Logger logger = LoggerFactory.getLogger(IndexConstructorMain.class);
public static void main(String[] args) throws Exception {
static void main(String[] args) throws Exception {
Instructions<CreateIndexRequest> instructions = null;
try {
new org.mariadb.jdbc.Driver();
@@ -83,7 +82,7 @@ public class IndexConstructorMain extends ProcessMainClass {
this.domainRankings = domainRankings;
}
private void run(CreateIndexRequest instructions) throws SQLException, IOException {
private void run(CreateIndexRequest instructions) throws IOException {
heartbeat.start();
switch (instructions.indexName()) {

View File

@@ -5,7 +5,5 @@ import nu.marginalia.btree.model.BTreeContext;
/**
 * Shared {@link BTreeContext} configurations for the reverse index B-trees.
 * Readers must be opened with the same context the corresponding tree was built with.
 *
 * NOTE(review): the meaning of the numeric constructor arguments is not visible here.
 * The second argument looks like the entry size in longs (callers treat the full
 * docs and words data as 2-long records) -- confirm against BTreeContext.
 */
public class ReverseIndexParameters
{
// Context for the priority-index document trees (presumably one long per entry -- TODO confirm).
public static final BTreeContext prioDocsBTreeContext = new BTreeContext(5, 1, BTreeBlockSize.BS_512);
// Context for the full-index document trees; their data is read as 2-long records.
public static final BTreeContext fullDocsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_512);
// Context for the words tree; its data is read as 2-long records.
public static final BTreeContext wordsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_512);
}

View File

@@ -1,109 +0,0 @@
package nu.marginalia.index.reverse;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import nu.marginalia.array.LongArray;
import nu.marginalia.btree.BTreeReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Random;
/**
 * Diagnostic self-tests for a constructed reverse index.
 *
 * <p>Each test logs a "Starting", then either a "Passed" or a "Failed" message; nothing is
 * thrown or returned. Failures are always logged at {@code error} level.
 *
 * <p>{@code wordsDataRange} is read as 2-long records: the even slot is the word id and the
 * odd slot is the offset of that word's document B-tree inside {@code documents}.
 * Document data is likewise read as 2-long records with the document id in the even slot.
 */
public class ReverseIndexSelfTest {
    private static final Logger logger = LoggerFactory.getLogger(ReverseIndexSelfTest.class);

    /**
     * Cap for the hash set presizing hint in test 5. It is only a hint (the set grows on
     * demand); the cap keeps the long-to-int narrowing from producing a bogus capacity.
     */
    private static final int MAX_PRESIZE_HINT = 1 << 28;

    /**
     * Test 1: verifies that the words data is sorted when viewed as 2-long records.
     *
     * @param wordsDataRange the words data
     * @param wordsDataSize  end bound passed to the sortedness check
     */
    public static void runSelfTest1(LongArray wordsDataRange, long wordsDataSize) {
        logger.info("Starting test 1");
        if (!wordsDataRange.isSortedN(2, 0, wordsDataSize)) {
            logger.error("Failed test 1: Words data is not sorted");
        } else {
            logger.info("Passed test 1");
        }
    }

    /**
     * Test 2: verifies that every word's document data is sorted.
     *
     * @param wordsDataRange the words data
     * @param documents      the array holding the per-word document B-trees
     */
    public static void runSelfTest2(LongArray wordsDataRange, LongArray documents) {
        logger.info("Starting test 2");
        // Odd slots hold the offset of the word's document tree.
        for (long i = 1; i < wordsDataRange.size(); i += 2) {
            var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
            var header = docsBTreeReader.getHeader();
            var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L);
            if (!docRange.isSortedN(2, 0, header.numEntries() * 2L)) {
                logger.error("Failed test 2: numEntries={}, offset={}", header.numEntries(), header.dataOffsetLongs());
                return;
            }
        }
        logger.info("Passed test 2");
    }

    /**
     * Test 3: verifies that every word id present in the words data can be found via the reader.
     *
     * @param wordsDataRange the words data
     * @param reader         reader for the words B-tree
     */
    public static void runSelfTest3(LongArray wordsDataRange, BTreeReader reader) {
        logger.info("Starting test 3");
        for (long i = 0; i < wordsDataRange.size(); i += 2) {
            if (reader.findEntry(wordsDataRange.get(i)) < 0) {
                logger.error("Failed Test 3");
                return;
            }
        }
        logger.info("Passed test 3");
    }

    /**
     * Test 4: verifies that every document id stored for a word can be found via that
     * word's document B-tree.
     *
     * @param wordsDataRange the words data
     * @param documents      the array holding the per-word document B-trees
     */
    public static void runSelfTest4(LongArray wordsDataRange, LongArray documents) {
        logger.info("Starting test 4");
        for (long i = 1; i < wordsDataRange.size(); i += 2) {
            var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
            var header = docsBTreeReader.getHeader();
            var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L);
            // long index: the range size is a long and must not be compared against a wrapping int
            for (long j = 0; j < docRange.size(); j += 2) {
                if (docsBTreeReader.findEntry(docRange.get(j)) < 0) {
                    logger.error("Failed test 4");
                    return;
                }
            }
        }
        logger.info("Passed test 4");
    }

    /**
     * Test 5: probes the words B-tree with random ids that are known to be absent and
     * verifies that none of them is reported as found.
     *
     * @param wordsDataRange   the words data
     * @param wordsBTreeReader reader for the words B-tree
     */
    public static void runSelfTest5(LongArray wordsDataRange, BTreeReader wordsBTreeReader) {
        logger.info("Starting test 5");
        // Divide before narrowing; the original cast the size to int first.
        int presize = (int) Math.min(wordsDataRange.size() / 2, MAX_PRESIZE_HINT);
        LongOpenHashSet words = new LongOpenHashSet(presize);
        for (long i = 0; i < wordsDataRange.size(); i += 2) {
            words.add(wordsDataRange.get(i));
        }
        var random = new Random();
        for (int i = 0; i < 100_000_000; i++) {
            long v;
            // Re-roll until we have an id that is definitely not a known word.
            do {
                v = random.nextLong();
            } while (words.contains(v));
            if (wordsBTreeReader.findEntry(v) >= 0) {
                logger.error("Failed test 5 @ W{}", v);
                return;
            }
        }
        logger.info("Passed test 5");
    }

    /**
     * Test 6: for every word, verifies that no id lying strictly between two consecutive
     * stored document ids is reported as found by that word's document B-tree.
     *
     * @param wordsDataRange the words data
     * @param documents      the array holding the per-word document B-trees
     */
    public static void runSelfTest6(LongArray wordsDataRange, LongArray documents) {
        logger.info("Starting test 6");
        for (long i = 1; i < wordsDataRange.size(); i += 2) {
            var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
            var header = docsBTreeReader.getHeader();
            var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L);

            boolean havePrev = false;
            long prev = 0;
            for (long j = 0; j < docRange.size(); j += 2) {
                long current = docRange.get(j);
                // The prev < current guard also keeps prev + 1 from overflowing.
                if (havePrev && prev < current) {
                    for (long absent = prev + 1; absent < current; absent++) {
                        if (docsBTreeReader.findEntry(absent) >= 0) {
                            logger.error("Failed test 6 @ W{}:D{}", wordsDataRange.get(i - 1), absent);
                            return;
                        }
                    }
                }
                // Advance the lower bound. The original never did, so later gap scans
                // ran across ids that legitimately exist and reported false failures.
                prev = current;
                havePrev = true;
            }
        }
        logger.info("Passed test 6");
    }
}

View File

@@ -1,50 +0,0 @@
package nu.marginalia.index.reverse;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.btree.BTreeReader;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Path;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
 * Manual debugging aid for a reverse index lookup anomaly. It is disabled by default
 * because it depends on index files at a developer-local path.
 */
public class ReverseIndexDebugTest {

    /**
     * Looks up a specific word in the words B-tree, opens the document B-tree at the
     * returned offset, and cross-checks the B-tree lookup of a specific document id
     * against a linear scan and a binary search of the raw document data.
     *
     * @throws IOException if the index files cannot be memory-mapped
     */
    @Test
    @Disabled // this is a debugging utility
    public void debug() throws IOException {
        long problemWord = -7909917549851025932L;
        long problemDoc = 9079256848846028801L;

        var words = LongArrayFactory.mmapForReadingConfined(Path.of("/home/vlofgren/Code/MarginaliaSearch/run/node-1/index/ir/rev-words.dat"));
        var documents = LongArrayFactory.mmapForReadingConfined(Path.of("/home/vlofgren/Code/MarginaliaSearch/run/node-1/index/ir/rev-docs.dat"));

        var wordsBTreeReader = new BTreeReader(words, ReverseIndexParameters.wordsBTreeContext, 0);

        long wordOffset = wordsBTreeReader.findEntry(problemWord);
        assertTrue(wordOffset >= 0);

        var docsReader = new BTreeReader(documents, ReverseIndexParameters.prioDocsBTreeContext, wordOffset);

        // Symptom under investigation: findEntry reported problemDoc as present even though
        // it does not exist in the document range. A correct lookup must report a miss.
        long docOffset = docsReader.findEntry(problemDoc);
        assertTrue(docOffset < 0);

        // We know it doesn't exist because when we check, we can't find it,
        // either by iterating...
        var dataRange = docsReader.data();
        System.out.println(dataRange.size());
        // long index: the data size is a long and must not be compared against a wrapping int
        for (long i = 0; i < dataRange.size(); i += 2) {
            assertNotEquals(problemDoc, dataRange.get(i));
        }

        // or by binary searching
        assertTrue(dataRange.binarySearchN(2, problemDoc, 0, dataRange.size()) < 0);
    }
}