mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
(refac) Clean up index code
This commit is contained in:
@@ -29,7 +29,6 @@ import org.slf4j.LoggerFactory;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class IndexConstructorMain extends ProcessMainClass {
|
||||
@@ -39,7 +38,7 @@ public class IndexConstructorMain extends ProcessMainClass {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(IndexConstructorMain.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
static void main(String[] args) throws Exception {
|
||||
Instructions<CreateIndexRequest> instructions = null;
|
||||
try {
|
||||
new org.mariadb.jdbc.Driver();
|
||||
@@ -83,7 +82,7 @@ public class IndexConstructorMain extends ProcessMainClass {
|
||||
this.domainRankings = domainRankings;
|
||||
}
|
||||
|
||||
private void run(CreateIndexRequest instructions) throws SQLException, IOException {
|
||||
private void run(CreateIndexRequest instructions) throws IOException {
|
||||
heartbeat.start();
|
||||
|
||||
switch (instructions.indexName()) {
|
||||
|
@@ -5,7 +5,5 @@ import nu.marginalia.btree.model.BTreeContext;
|
||||
|
||||
// Holder for the shared BTreeContext constants declared below.
// NOTE(review): this listing comes from a diff hunk that shrinks the region from 7 to 5 lines,
// so not every declaration shown here necessarily survives the commit — confirm against the repository.
public class ReverseIndexParameters
|
||||
{
|
||||
// Context presumably used for the priority documents B-tree — TODO confirm against callers.
// Differs from fullDocsBTreeContext only in the second constructor argument (1 instead of 2).
public static final BTreeContext prioDocsBTreeContext = new BTreeContext(5, 1, BTreeBlockSize.BS_512);
|
||||
// Context presumably used for the full documents B-tree — TODO confirm against callers.
// The meaning of the numeric arguments (5, 2) is not visible here; see BTreeContext.
public static final BTreeContext fullDocsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_512);
|
||||
// Context presumably used for the words B-tree; same arguments as fullDocsBTreeContext.
public static final BTreeContext wordsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_512);
|
||||
}
|
||||
|
@@ -1,109 +0,0 @@
|
||||
package nu.marginalia.index.reverse;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.btree.BTreeReader;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.Random;
|
||||
|
||||
public class ReverseIndexSelfTest {
|
||||
private static final Logger logger = LoggerFactory.getLogger(ReverseIndexSelfTest.class);
|
||||
public static void runSelfTest1(LongArray wordsDataRange, long wordsDataSize) {
|
||||
logger.info("Starting test 1");
|
||||
|
||||
if (!wordsDataRange.isSortedN(2, 0, wordsDataSize))
|
||||
logger.error("Failed test 1: Words data is not sorted");
|
||||
else
|
||||
logger.info("Passed test 1");
|
||||
}
|
||||
|
||||
public static void runSelfTest2(LongArray wordsDataRange, LongArray documents) {
|
||||
logger.info("Starting test 2");
|
||||
for (long i = 1; i < wordsDataRange.size(); i+=2) {
|
||||
var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
|
||||
var header = docsBTreeReader.getHeader();
|
||||
var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L);
|
||||
|
||||
if (!docRange.isSortedN(2, 0, header.numEntries() * 2L)) {
|
||||
logger.error("Failed test 2: numEntries={}, offset={}", header.numEntries(), header.dataOffsetLongs());
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
logger.info("Passed test 2");
|
||||
}
|
||||
|
||||
public static void runSelfTest3(LongArray wordsDataRange, BTreeReader reader) {
|
||||
logger.info("Starting test 3");
|
||||
for (long i = 0; i < wordsDataRange.size(); i+=2) {
|
||||
if (reader.findEntry(wordsDataRange.get(i)) < 0) {
|
||||
logger.error("Failed Test 3");
|
||||
return;
|
||||
}
|
||||
}
|
||||
logger.info("Passed test 3");
|
||||
}
|
||||
|
||||
public static void runSelfTest4(LongArray wordsDataRange, LongArray documents) {
|
||||
logger.info("Starting test 4");
|
||||
for (long i = 1; i < wordsDataRange.size(); i+=2) {
|
||||
var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
|
||||
var header = docsBTreeReader.getHeader();
|
||||
var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L);
|
||||
for (int j = 0; j < docRange.size(); j+=2) {
|
||||
if (docsBTreeReader.findEntry(docRange.get(j)) < 0) {
|
||||
logger.info("Failed test 4");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
logger.info("Passed test 4");
|
||||
}
|
||||
public static void runSelfTest5(LongArray wordsDataRange, BTreeReader wordsBTreeReader) {
|
||||
logger.info("Starting test 5");
|
||||
LongOpenHashSet words = new LongOpenHashSet((int)wordsDataRange.size()/2);
|
||||
for (int i = 0; i < wordsDataRange.size(); i+=2) {
|
||||
words.add(wordsDataRange.get(i));
|
||||
}
|
||||
var random = new Random();
|
||||
for (int i = 0; i < 100_000_000; i++) {
|
||||
long v;
|
||||
do {
|
||||
v = random.nextLong();
|
||||
} while (words.contains(v));
|
||||
if (wordsBTreeReader.findEntry(v) >= 0) {
|
||||
logger.error("Failed test 5 @ W{}", v);
|
||||
return;
|
||||
}
|
||||
}
|
||||
logger.info("Passed test 5");
|
||||
}
|
||||
|
||||
public static void runSelfTest6(LongArray wordsDataRange, LongArray documents) {
|
||||
logger.info("Starting test 6");
|
||||
for (long i = 1; i < wordsDataRange.size(); i+=2) {
|
||||
var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
|
||||
var header = docsBTreeReader.getHeader();
|
||||
var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L);
|
||||
Long prev = null;
|
||||
for (int j = 0; j < docRange.size(); j+=2) {
|
||||
if (prev == null) {
|
||||
prev = docRange.get(j);
|
||||
continue;
|
||||
}
|
||||
long thisVal = prev + 1;
|
||||
long nextVal = docRange.get(j);
|
||||
while (thisVal < nextVal) {
|
||||
if (docsBTreeReader.findEntry(thisVal) >= 0) {
|
||||
logger.info("Failed test 6 @ W{}:D{}", wordsDataRange.get(i-1), thisVal);
|
||||
return;
|
||||
}
|
||||
thisVal++;
|
||||
}
|
||||
}
|
||||
}
|
||||
logger.info("Passed test 6");
|
||||
}
|
||||
}
|
@@ -1,50 +0,0 @@
|
||||
package nu.marginalia.index.reverse;
|
||||
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.btree.BTreeReader;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertNotEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
public class ReverseIndexDebugTest {
|
||||
@Test
|
||||
@Disabled // this is a debugging utility
|
||||
public void debug() throws IOException {
|
||||
long problemWord = -7909917549851025932L;
|
||||
long problemDoc = 9079256848846028801L;
|
||||
|
||||
var words = LongArrayFactory.mmapForReadingConfined(Path.of("/home/vlofgren/Code/MarginaliaSearch/run/node-1/index/ir/rev-words.dat"));
|
||||
var documents = LongArrayFactory.mmapForReadingConfined(Path.of("/home/vlofgren/Code/MarginaliaSearch/run/node-1/index/ir/rev-docs.dat"));
|
||||
|
||||
var wordsBTreeReader = new BTreeReader(words, ReverseIndexParameters.wordsBTreeContext, 0);
|
||||
var wordsDataOffset = wordsBTreeReader.getHeader().dataOffsetLongs();
|
||||
|
||||
long wordOffset = wordsBTreeReader.findEntry(problemWord);
|
||||
assertTrue(wordOffset >= 0);
|
||||
|
||||
var docsReader = new BTreeReader(documents, ReverseIndexParameters.prioDocsBTreeContext, wordOffset);
|
||||
|
||||
// We find problemDoc even though it doesn't exist in the document range
|
||||
long docOffset = docsReader.findEntry(problemDoc);
|
||||
assertTrue(docOffset < 0);
|
||||
|
||||
// We know it doesn't exist because when we check, we can't find it,
|
||||
// either by iterating...
|
||||
var dataRange = docsReader.data();
|
||||
System.out.println(dataRange.size());
|
||||
for (int i = 0; i < dataRange.size(); i+=2) {
|
||||
|
||||
assertNotEquals(problemDoc, dataRange.get(i));
|
||||
}
|
||||
|
||||
// or by binary searching
|
||||
assertTrue(dataRange.binarySearchN(2, problemDoc, 0, dataRange.size()) < 0);
|
||||
|
||||
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user