mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
(encyclopedia) Fix memory issue in preconversion step
Use SimpleBlockingThreadPool instead of Java's work-stealing pool (Executors.newWorkStealingPool), as the latter causes runaway memory consumption in some circumstances, while SimpleBlockingThreadPool uses a bounded queue and always pushes back against the supplier if it can't hold any more tasks.
This commit is contained in:
@@ -3,6 +3,7 @@ package nu.marginalia.encyclopedia;
|
||||
import nu.marginalia.encyclopedia.cleaner.WikiCleaner;
|
||||
import nu.marginalia.encyclopedia.store.ArticleDbProvider;
|
||||
import nu.marginalia.encyclopedia.store.ArticleStoreWriter;
|
||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||
import org.openzim.ZIMTypes.ZIMFile;
|
||||
import org.openzim.ZIMTypes.ZIMReader;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@@ -12,7 +13,7 @@ import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.Predicate;
|
||||
@@ -25,36 +26,33 @@ public class EncyclopediaConverter {
|
||||
|
||||
public static void convert(Path inputFile, Path outputFile) throws IOException, SQLException, InterruptedException {
|
||||
var wc = new WikiCleaner();
|
||||
var size = new AtomicInteger();
|
||||
if (!Files.exists(inputFile)) {
|
||||
throw new IllegalStateException("ZIM file not found: " + inputFile);
|
||||
}
|
||||
Files.deleteIfExists(outputFile);
|
||||
|
||||
try (var executor = Executors.newWorkStealingPool(Math.clamp(Runtime.getRuntime().availableProcessors() - 2, 1, 32))) {
|
||||
var pool = new SimpleBlockingThreadPool("converter-pool",
|
||||
Math.clamp(Runtime.getRuntime().availableProcessors() - 2, 1, 32),
|
||||
8);
|
||||
|
||||
var size = new AtomicInteger();
|
||||
try (var asw = new ArticleStoreWriter(new ArticleDbProvider(outputFile))) {
|
||||
Predicate<Integer> keepGoing = (s) -> true;
|
||||
|
||||
if (!Files.exists(inputFile)) {
|
||||
throw new IllegalStateException("ZIM file not found: " + inputFile);
|
||||
}
|
||||
Files.deleteIfExists(outputFile);
|
||||
BiConsumer<String, String> handleArticle = (url, html) -> {
|
||||
pool.submitQuietly(() -> {
|
||||
int sz = size.incrementAndGet();
|
||||
if (sz % 1000 == 0) {
|
||||
System.out.printf("\u001b[2K\r%d", sz);
|
||||
}
|
||||
asw.add(wc.cleanWikiJunk(url, html));
|
||||
});
|
||||
};
|
||||
|
||||
try (var asw = new ArticleStoreWriter(new ArticleDbProvider(outputFile))) {
|
||||
Predicate<Integer> keepGoing = (s) -> true;
|
||||
new ZIMReader(new ZIMFile(inputFile.toString())).forEachArticles(handleArticle, keepGoing);
|
||||
|
||||
BiConsumer<String, String> handleArticle = (url, html) -> {
|
||||
if (executor.isTerminated())
|
||||
return;
|
||||
|
||||
executor.submit(() -> {
|
||||
int sz = size.incrementAndGet();
|
||||
if (sz % 1000 == 0) {
|
||||
System.out.printf("\u001b[2K\r%d", sz);
|
||||
}
|
||||
asw.add(wc.cleanWikiJunk(url, html));
|
||||
});
|
||||
|
||||
size.incrementAndGet();
|
||||
};
|
||||
|
||||
new ZIMReader(new ZIMFile(inputFile.toString())).forEachArticles(handleArticle, keepGoing);
|
||||
}
|
||||
pool.shutDown();
|
||||
pool.awaitTermination(1, TimeUnit.DAYS);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user