1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

2 Commits

Author SHA1 Message Date
Viktor Lofgren
be7d13ccce (crawler) Correct task execution logic in crawler
The old behavior would flag domains as pending too soon, leading to them being omitted from execution if they were not immediately available to run.
2025-03-09 13:47:51 +01:00
Viktor Lofgren
8c088a7c0b (crawler) Remove custom thread factory
This was causing issues, and not really doing much of benefit.
2025-03-09 11:50:52 +01:00
2 changed files with 6 additions and 14 deletions

View File

@@ -266,11 +266,11 @@ public class CrawlerMain extends ProcessMainClass {
// Start every task we currently can from the deferral list
deferredTasks.removeIf(task -> {
+ if (task.canRun()) {
if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) != null) {
return true; // task has already run, duplicate in crawl specs
}
- if (task.canRun()) {
// This blocks the caller when the pool is full
pool.submitQuietly(task);
return true;
@@ -280,7 +280,7 @@ public class CrawlerMain extends ProcessMainClass {
});
}
- // Schedule any lingering tasks
+ // Schedule any lingering tasks for immediate execution
for (var task : deferredTasks) {
if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null)
continue;

View File

@@ -60,15 +60,7 @@ public class HttpFetcherImpl implements HttpFetcher {
.cookieHandler(cookies)
.followRedirects(HttpClient.Redirect.NORMAL)
.connectTimeout(Duration.ofSeconds(8))
- .executor(Executors.newCachedThreadPool(
- r -> Thread.ofPlatform()
- .name("FetcherClient")
- .daemon(true)
- .uncaughtExceptionHandler((t, ex) -> {
- logger.error("Uncaught Exception in " + t.getName(), ex);
- })
- .start(r)
- ))
+ .executor(Executors.newCachedThreadPool())
.build();
}