1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

2 Commits

Author SHA1 Message Date
Viktor Lofgren
1c2426a052 (crawler) Further rearrange crawl order
Limit crawl order preference to edu domains, to avoid hitting stuff like medium and wordpress with shotgun requests.
2025-03-27 11:19:20 +01:00
Viktor Lofgren
34df7441ac (crawler) Add some jitter to crawl delay to avoid accidentally synchronized requests 2025-03-27 11:15:16 +01:00
2 changed files with 8 additions and 8 deletions

View File

@@ -302,8 +302,8 @@ public class CrawlerMain extends ProcessMainClass {
} }
/** Create a comparator that sorts the crawl specs in a way that is beneficial for the crawl, /** Create a comparator that sorts the crawl specs in a way that is beneficial for the crawl,
* we want to enqueue domains that have common top domains first, but otherwise have a random * we want to enqueue domains that tend to be large and have common top domains first,
* order. * but otherwise have a random order.
* <p></p> * <p></p>
* Note, we can't use hash codes for randomization as it is not desirable to have the same order * Note, we can't use hash codes for randomization as it is not desirable to have the same order
* every time the process is restarted (and CrawlSpecRecord is a record, which defines equals and * every time the process is restarted (and CrawlSpecRecord is a record, which defines equals and
@@ -311,15 +311,13 @@ public class CrawlerMain extends ProcessMainClass {
* */ * */
private Comparator<CrawlSpecRecord> crawlSpecArrangement(List<CrawlSpecRecord> records) { private Comparator<CrawlSpecRecord> crawlSpecArrangement(List<CrawlSpecRecord> records) {
Random r = new Random(); Random r = new Random();
Map<String, Integer> topDomainCounts = new HashMap<>(4 + (int) Math.sqrt(records.size()));
Map<String, Integer> randomOrder = new HashMap<>(records.size()); Map<String, Integer> randomOrder = new HashMap<>(records.size());
for (var spec : records) { for (var spec : records) {
topDomainCounts.merge(EdgeDomain.getTopDomain(spec.domain), 1, Integer::sum);
randomOrder.put(spec.domain, r.nextInt()); randomOrder.put(spec.domain, r.nextInt());
} }
return Comparator.comparing((CrawlSpecRecord spec) -> topDomainCounts.getOrDefault(EdgeDomain.getTopDomain(spec.domain), 0) >= 8) return Comparator.comparing((CrawlSpecRecord spec) -> spec.domain.contains(".edu"))
.reversed() .reversed()
.thenComparing(spec -> randomOrder.get(spec.domain)) .thenComparing(spec -> randomOrder.get(spec.domain))
.thenComparing(Record::hashCode); // non-deterministic tie-breaker to .thenComparing(Record::hashCode); // non-deterministic tie-breaker to

View File

@@ -3,6 +3,7 @@ package nu.marginalia.crawl.retreival;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import java.time.Duration; import java.time.Duration;
import java.util.concurrent.ThreadLocalRandom;
import static java.lang.Math.max; import static java.lang.Math.max;
import static java.lang.Math.min; import static java.lang.Math.min;
@@ -53,12 +54,13 @@ public class CrawlDelayTimer {
public void waitFetchDelay(long spentTime) { public void waitFetchDelay(long spentTime) {
long sleepTime = delayTime; long sleepTime = delayTime;
long jitter = ThreadLocalRandom.current().nextLong(0, 150);
try { try {
if (sleepTime >= 1) { if (sleepTime >= 1) {
if (spentTime > sleepTime) if (spentTime > sleepTime)
return; return;
Thread.sleep(min(sleepTime - spentTime, 5000)); Thread.sleep(min(sleepTime - spentTime, 5000) + jitter);
} else { } else {
// When no crawl delay is specified, lean toward twice the fetch+process time, // When no crawl delay is specified, lean toward twice the fetch+process time,
// within sane limits. This means slower servers get slower crawling, and faster // within sane limits. This means slower servers get slower crawling, and faster
@@ -71,12 +73,12 @@ public class CrawlDelayTimer {
if (spentTime > sleepTime) if (spentTime > sleepTime)
return; return;
Thread.sleep(sleepTime - spentTime); Thread.sleep(sleepTime - spentTime + jitter);
} }
if (slowDown) { if (slowDown) {
// Additional delay when the server is signalling it wants slower requests // Additional delay when the server is signalling it wants slower requests
Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS); Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS + jitter);
} }
} }
catch (InterruptedException e) { catch (InterruptedException e) {