Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-10-05 21:22:39 +02:00

Compare commits: deploy-011...deploy-011 (3 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 1c2426a052 | |
| | 34df7441ac | |
| | 5387e2bd80 | |
CrawlerMain.java:

```diff
@@ -302,8 +302,8 @@ public class CrawlerMain extends ProcessMainClass {
     }

     /** Create a comparator that sorts the crawl specs in a way that is beneficial for the crawl,
-     * we want to enqueue domains that have common top domains first, but otherwise have a random
-     * order.
+     * we want to enqueue domains that tend to be large and have common top domains first,
+     * but otherwise have a random order.
      * <p></p>
      * Note, we can't use hash codes for randomization as it is not desirable to have the same order
      * every time the process is restarted (and CrawlSpecRecord is a record, which defines equals and
@@ -311,15 +311,13 @@ public class CrawlerMain extends ProcessMainClass {
      * */
     private Comparator<CrawlSpecRecord> crawlSpecArrangement(List<CrawlSpecRecord> records) {
         Random r = new Random();
-        Map<String, Integer> topDomainCounts = new HashMap<>(4 + (int) Math.sqrt(records.size()));
         Map<String, Integer> randomOrder = new HashMap<>(records.size());

         for (var spec : records) {
-            topDomainCounts.merge(EdgeDomain.getTopDomain(spec.domain), 1, Integer::sum);
             randomOrder.put(spec.domain, r.nextInt());
         }

-        return Comparator.comparing((CrawlSpecRecord spec) -> topDomainCounts.getOrDefault(EdgeDomain.getTopDomain(spec.domain), 0))
+        return Comparator.comparing((CrawlSpecRecord spec) -> spec.domain.contains(".edu"))
                 .reversed()
                 .thenComparing(spec -> randomOrder.get(spec.domain))
                 .thenComparing(Record::hashCode); // non-deterministic tie-breaker to
```
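Taken together, the CrawlerMain change drops the top-domain frequency ordering in favour of front-loading `.edu` domains, which tend to be large, while keeping the per-run random shuffle. The comparator pattern is easier to study in isolation: below is a minimal, self-contained sketch, not the project's actual code — `CrawlOrderSketch` is hypothetical and `CrawlSpecRecord` is reduced to a one-field stand-in with made-up sample domains.

```java
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;

public class CrawlOrderSketch {
    // One-field stand-in for the real CrawlSpecRecord, for illustration only.
    record CrawlSpecRecord(String domain) {}

    public static void main(String[] args) {
        List<CrawlSpecRecord> records = List.of(
                new CrawlSpecRecord("www.example.com"),
                new CrawlSpecRecord("cs.example.edu"),
                new CrawlSpecRecord("blog.example.org"),
                new CrawlSpecRecord("www.another.edu"));

        // A fresh Random per run gives a fresh order per run. This is the point of
        // the Javadoc note above: a record's hashCode is computed from its field
        // values, so a hash-based shuffle would repeat the same order on restart.
        Random r = new Random();
        Map<String, Integer> randomOrder = new HashMap<>(records.size());
        for (var spec : records) {
            randomOrder.put(spec.domain(), r.nextInt());
        }

        // Boolean sorts false before true, so .reversed() is what moves the .edu
        // domains (true) to the head of the queue.
        Comparator<CrawlSpecRecord> order = Comparator
                .comparing((CrawlSpecRecord spec) -> spec.domain().contains(".edu"))
                .reversed()
                .thenComparing(spec -> randomOrder.get(spec.domain()))
                .thenComparing(Record::hashCode); // tie-breaker, as in the patch

        records.stream().sorted(order).forEach(spec -> System.out.println(spec.domain()));
    }
}
```

Note that `.reversed()` applies only to the `.edu` key, since it precedes the `thenComparing` calls; ties between two `.edu` (or two non-`.edu`) domains fall through to the random order.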
CrawlDelayTimer.java:

```diff
@@ -3,6 +3,7 @@ package nu.marginalia.crawl.retreival;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;

 import java.time.Duration;
+import java.util.concurrent.ThreadLocalRandom;

 import static java.lang.Math.max;
 import static java.lang.Math.min;
@@ -53,12 +54,13 @@ public class CrawlDelayTimer {
     public void waitFetchDelay(long spentTime) {
         long sleepTime = delayTime;

+        long jitter = ThreadLocalRandom.current().nextLong(0, 150);
         try {
             if (sleepTime >= 1) {
                 if (spentTime > sleepTime)
                     return;

-                Thread.sleep(min(sleepTime - spentTime, 5000));
+                Thread.sleep(min(sleepTime - spentTime, 5000) + jitter);
             } else {
                 // When no crawl delay is specified, lean toward twice the fetch+process time,
                 // within sane limits. This means slower servers get slower crawling, and faster
@@ -71,12 +73,12 @@ public class CrawlDelayTimer {
                 if (spentTime > sleepTime)
                     return;

-                Thread.sleep(sleepTime - spentTime);
+                Thread.sleep(sleepTime - spentTime + jitter);
             }

             if (slowDown) {
                 // Additional delay when the server is signalling it wants slower requests
-                Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS);
+                Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS + jitter);
             }
         }
         catch (InterruptedException e) {
```
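The CrawlDelayTimer change threads up to 150 ms of random jitter into every sleep, presumably so that concurrent crawl tasks that happen to start together do not keep waking in lockstep. A minimal sketch of the sleep-with-jitter pattern follows, reduced to one free-standing method — `DelaySketch` and its parameters are hypothetical, and the 5-second cap mirrors the code above:

```java
import java.util.concurrent.ThreadLocalRandom;

import static java.lang.Math.min;

public class DelaySketch {

    /** Sleeps out whatever remains of the crawl delay after a fetch that took
     *  spentTimeMs, adding 0-150 ms of jitter to decorrelate parallel tasks. */
    static void waitFetchDelay(long spentTimeMs, long delayTimeMs) throws InterruptedException {
        // Same bounds as the patch: a random value in [0, 150) ms.
        long jitter = ThreadLocalRandom.current().nextLong(0, 150);

        if (spentTimeMs > delayTimeMs)
            return; // the fetch alone already took longer than the required delay

        // Cap the base wait at 5 s so an excessive robots.txt crawl delay cannot
        // stall the task indefinitely, then add the jitter on top.
        Thread.sleep(min(delayTimeMs - spentTimeMs, 5000) + jitter);
    }

    public static void main(String[] args) throws InterruptedException {
        long start = System.currentTimeMillis();
        waitFetchDelay(120, 500); // 120 ms already spent against a 500 ms delay
        System.out.println("slept ~" + (System.currentTimeMillis() - start) + " ms");
    }
}
```

Because the jitter is drawn once per call but added to whichever sleep branch runs, each request's total delay varies slightly while the configured minimum is still respected.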