1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits

...

3 Commits

Author SHA1 Message Date
Viktor Lofgren
8c088a7c0b (crawler) Remove custom thread factory
This was causing issues, and not really doing much of benefit.
2025-03-09 11:50:52 +01:00
Viktor Lofgren
ea9a642b9b (crawler) More effective task scheduling in the crawler
This should hopefully allow more threads to be busy
2025-03-09 11:44:59 +01:00
Viktor Lofgren
27f528af6a (search) Fix "Remove Javascript" toggle
A bug was introduced at some point where the special keyword for filtering on javascript was changed to special:scripts, from js:true/js:false.

Solves issue #155
2025-02-28 12:03:04 +01:00
5 changed files with 50 additions and 18 deletions

View File

@@ -41,10 +41,7 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.security.Security;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
@@ -248,22 +245,47 @@ public class CrawlerMain extends ProcessMainClass {
// (this happens when the process is restarted after a crash or a shutdown)
tasksDone.set(workLog.countFinishedJobs());
// List of deferred tasks used to ensure beneficial scheduling of domains with regard to DomainLocks,
// merely shuffling the domains tends to lead to a lot of threads being blocked waiting for a semphore,
// this will more aggressively attempt to schedule the jobs to avoid blocking
List<CrawlTask> deferredTasks = new LinkedList<>();
// Create crawl tasks and submit them to the pool for execution
for (CrawlSpecRecord crawlSpec : crawlSpecRecords) {
if (workLog.isJobFinished(crawlSpec.domain()))
continue;
var task = new CrawlTask(
// Add to the end of the deferral list
deferredTasks.addLast(new CrawlTask(
crawlSpec,
anchorTagsSource,
outputDir,
warcArchiver,
domainStateDb,
workLog);
workLog));
if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) == null) {
pool.submitQuietly(task);
}
// Start every task we currently can from the deferral list
deferredTasks.removeIf(task -> {
if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) != null) {
return true; // task has already run, duplicate in crawl specs
}
if (task.canRun()) {
// This blocks the caller when the pool is full
pool.submitQuietly(task);
return true;
}
return false;
});
}
// Schedule any lingering tasks
for (var task : deferredTasks) {
if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null)
continue;
pool.submitQuietly(task);
}
logger.info("Shutting down the pool, waiting for tasks to complete...");
@@ -346,6 +368,12 @@ public class CrawlerMain extends ProcessMainClass {
this.id = Integer.toHexString(domain.hashCode());
}
/** Best effort indicator whether we could start this now without getting stuck in
* DomainLocks purgatory */
public boolean canRun() {
return domainLocks.canLock(new EdgeDomain(domain));
}
@Override
public void run() throws Exception {

View File

@@ -251,6 +251,7 @@ public class HttpFetcherImpl implements HttpFetcher {
return new SitemapRetriever();
}
/** Recursively fetch sitemaps */
@Override
public List<EdgeUrl> fetchSitemapUrls(String root, CrawlDelayTimer delayTimer) {
try {
@@ -270,7 +271,7 @@ public class HttpFetcherImpl implements HttpFetcher {
while (!sitemapQueue.isEmpty() && ret.size() < 20_000 && ++fetchedSitemaps < 10) {
var head = sitemapQueue.removeFirst();
switch (fetchSitemap(head)) {
switch (fetchSingleSitemap(head)) {
case SitemapResult.SitemapUrls(List<String> urls) -> {
for (var url : urls) {
@@ -306,7 +307,7 @@ public class HttpFetcherImpl implements HttpFetcher {
}
private SitemapResult fetchSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
HttpRequest getRequest = HttpRequest.newBuilder()
.GET()
.uri(sitemapUrl.asURI())

View File

@@ -44,6 +44,14 @@ public class DomainLocks {
return new Semaphore(2);
}
public boolean canLock(EdgeDomain domain) {
Semaphore sem = locks.get(domain.topDomain.toLowerCase());
if (null == sem)
return true;
else
return sem.availablePermits() > 0;
}
public static class DomainLock implements AutoCloseable {
private final String domainName;
private final Semaphore semaphore;

View File

@@ -7,8 +7,7 @@ import java.util.Arrays;
public enum SearchJsParameter {
DEFAULT("default"),
DENY_JS("no-js", "js:true"),
REQUIRE_JS("yes-js", "js:false");
DENY_JS("no-js", "special:scripts");
public final String value;
public final String[] implictExcludeSearchTerms;
@@ -20,7 +19,6 @@ public enum SearchJsParameter {
public static SearchJsParameter parse(@Nullable String value) {
if (DENY_JS.value.equals(value)) return DENY_JS;
if (REQUIRE_JS.value.equals(value)) return REQUIRE_JS;
return DEFAULT;
}

View File

@@ -7,9 +7,7 @@ import java.util.Arrays;
public enum SearchJsParameter {
DEFAULT("default"),
DENY_JS("no-js", "js:true"),
REQUIRE_JS("yes-js", "js:false");
DENY_JS("no-js", "special:scripts");
public final String value;
public final String[] implictExcludeSearchTerms;
@@ -20,7 +18,6 @@ public enum SearchJsParameter {
public static SearchJsParameter parse(@Nullable String value) {
if (DENY_JS.value.equals(value)) return DENY_JS;
if (REQUIRE_JS.value.equals(value)) return REQUIRE_JS;
return DEFAULT;
}