1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

(encyclopedia) Fix memory issue in preconversion step

Use SimpleBlockingThreadPool instead of Java's work-stealing pool, as the latter causes runaway memory consumption in some circumstances, while SimpleBlockingThreadPool uses a bounded queue and always pushes back against the supplier if it can't hold any more tasks.
This commit is contained in:
Viktor Lofgren
2024-04-05 16:57:53 +02:00
parent e1151ecf2a
commit 448a941de2

View File

@@ -3,6 +3,7 @@ package nu.marginalia.encyclopedia;
import nu.marginalia.encyclopedia.cleaner.WikiCleaner;
import nu.marginalia.encyclopedia.store.ArticleDbProvider;
import nu.marginalia.encyclopedia.store.ArticleStoreWriter;
import nu.marginalia.util.SimpleBlockingThreadPool;
import org.openzim.ZIMTypes.ZIMFile;
import org.openzim.ZIMTypes.ZIMReader;
import org.slf4j.LoggerFactory;
@@ -12,7 +13,7 @@ import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.BiConsumer;
import java.util.function.Predicate;
@@ -25,36 +26,33 @@ public class EncyclopediaConverter {
// NOTE(review): this body is rendered from a diff whose +/- markers, blank lines and
// indentation were lost. Lines from the earlier "executor" variant and the later "pool"
// variant are interleaved, so several statements appear twice and the text is not
// compilable as shown. Comments below mark which variant a line appears to belong to.
/**
 * Converts a ZIM archive into an article store.
 *
 * Every article reported by the ZIM reader is cleaned with {@code WikiCleaner#cleanWikiJunk}
 * and added to an {@code ArticleStoreWriter} backed by {@code outputFile}. The cleaning and
 * storing of each article is handed off to a worker pool rather than run by the reader
 * callback itself. Any existing {@code outputFile} is deleted before writing starts.
 *
 * @param inputFile  path of the ZIM file to read; must exist
 * @param outputFile path of the article database to (re)create
 * @throws IllegalStateException if {@code inputFile} does not exist
 * @throws IOException          declared by the signature; presumably from file or ZIM access - confirm
 * @throws SQLException         declared by the signature; presumably from the article store - confirm
 * @throws InterruptedException declared by the signature; presumably from waiting on the pool - confirm
 */
public static void convert(Path inputFile, Path outputFile) throws IOException, SQLException, InterruptedException {
var wc = new WikiCleaner();
// Counter of handled articles; drives the progress output below.
// (Declared twice in this rendering - once per diff side.)
var size = new AtomicInteger();
// Fail fast when the input is missing, then clear any stale output.
if (!Files.exists(inputFile)) {
throw new IllegalStateException("ZIM file not found: " + inputFile);
}
Files.deleteIfExists(outputFile);
// "executor" variant: work-stealing pool with (availableProcessors - 2) threads, clamped to [1, 32].
// Presumably the side of the diff that the commit removes - confirm against the repository.
try (var executor = Executors.newWorkStealingPool(Math.clamp(Runtime.getRuntime().availableProcessors() - 2, 1, 32))) {
// "pool" variant: same thread-count formula. The third argument (8) is presumably the
// bounded queue size mentioned in the commit message - confirm against SimpleBlockingThreadPool.
var pool = new SimpleBlockingThreadPool("converter-pool",
Math.clamp(Runtime.getRuntime().availableProcessors() - 2, 1, 32),
8);
var size = new AtomicInteger();
try (var asw = new ArticleStoreWriter(new ArticleDbProvider(outputFile))) {
// Predicate handed to the reader; always true here, so iteration is never cut short by it.
Predicate<Integer> keepGoing = (s) -> true;
if (!Files.exists(inputFile)) {
throw new IllegalStateException("ZIM file not found: " + inputFile);
}
Files.deleteIfExists(outputFile);
// "pool" variant of the per-article callback: submit the work to the pool.
BiConsumer<String, String> handleArticle = (url, html) -> {
pool.submitQuietly(() -> {
int sz = size.incrementAndGet();
// Every 1000th article: ESC[2K erases the terminal line and \r returns the cursor,
// so the counter is rewritten in place.
if (sz % 1000 == 0) {
System.out.printf("\u001b[2K\r%d", sz);
}
asw.add(wc.cleanWikiJunk(url, html));
});
};
try (var asw = new ArticleStoreWriter(new ArticleDbProvider(outputFile))) {
Predicate<Integer> keepGoing = (s) -> true;
// Drive the conversion: the reader invokes handleArticle with each article's (url, html).
new ZIMReader(new ZIMFile(inputFile.toString())).forEachArticles(handleArticle, keepGoing);
// "executor" variant of the per-article callback.
BiConsumer<String, String> handleArticle = (url, html) -> {
// Skip submission once the executor reports it has terminated.
if (executor.isTerminated())
return;
executor.submit(() -> {
int sz = size.incrementAndGet();
if (sz % 1000 == 0) {
System.out.printf("\u001b[2K\r%d", sz);
}
asw.add(wc.cleanWikiJunk(url, html));
});
// NOTE(review): second increment, outside the submitted task. If this line and the
// in-task increment belong to the same variant, the counter advances twice per article.
size.incrementAndGet();
};
new ZIMReader(new ZIMFile(inputFile.toString())).forEachArticles(handleArticle, keepGoing);
}
// "pool" variant: shut the pool down and wait up to one day for it to terminate.
// NOTE(review): whether this runs before or after the article writer is closed cannot be
// determined from this rendering (brace ownership is ambiguous) - confirm in the repository.
pool.shutDown();
pool.awaitTermination(1, TimeUnit.DAYS);
}
}
}