Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-10-06 07:32:38 +02:00
Compare commits: deploy-011 ... deploy-011 (2 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 1c2426a052 | |
| | 34df7441ac | |
CrawlerMain.java

@@ -302,8 +302,8 @@ public class CrawlerMain extends ProcessMainClass {
     }
 
     /** Create a comparator that sorts the crawl specs in a way that is beneficial for the crawl,
-     * we want to enqueue domains that have common top domains first, but otherwise have a random
-     * order.
+     * we want to enqueue domains that tend to be large and have common top domains first,
+     * but otherwise have a random order.
      * <p></p>
      * Note, we can't use hash codes for randomization as it is not desirable to have the same order
      * every time the process is restarted (and CrawlSpecRecord is a record, which defines equals and
@@ -311,15 +311,13 @@ public class CrawlerMain extends ProcessMainClass {
      * */
     private Comparator<CrawlSpecRecord> crawlSpecArrangement(List<CrawlSpecRecord> records) {
         Random r = new Random();
-        Map<String, Integer> topDomainCounts = new HashMap<>(4 + (int) Math.sqrt(records.size()));
         Map<String, Integer> randomOrder = new HashMap<>(records.size());
 
         for (var spec : records) {
-            topDomainCounts.merge(EdgeDomain.getTopDomain(spec.domain), 1, Integer::sum);
             randomOrder.put(spec.domain, r.nextInt());
         }
 
-        return Comparator.comparing((CrawlSpecRecord spec) -> topDomainCounts.getOrDefault(EdgeDomain.getTopDomain(spec.domain), 0) >= 8)
+        return Comparator.comparing((CrawlSpecRecord spec) -> spec.domain.contains(".edu"))
                 .reversed()
                 .thenComparing(spec -> randomOrder.get(spec.domain))
                 .thenComparing(Record::hashCode); // non-deterministic tie-breaker to
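The change above replaces the top-domain frequency heuristic with a simpler proxy for "tends to be large": domains containing ".edu" are enqueued first, and everything else follows in a fresh random order per run. As a rough standalone sketch of that ordering (the `Spec` record and the sample domains are hypothetical stand-ins for `CrawlSpecRecord`, not the project's types; only the shape of the comparator mirrors the patched code):

```java
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;

public class CrawlOrderSketch {
    // Hypothetical stand-in for CrawlSpecRecord; only the domain field matters here.
    record Spec(String domain) {}

    public static void main(String[] args) {
        List<Spec> specs = List.of(
                new Spec("example.com"),
                new Spec("cs.example.edu"),
                new Spec("blog.example.org"),
                new Spec("library.example.edu"));

        // Same shape as the patched code: a fresh random key per domain,
        // so the order differs between process restarts.
        Random r = new Random();
        Map<String, Integer> randomOrder = new HashMap<>(specs.size());
        for (var spec : specs) {
            randomOrder.put(spec.domain(), r.nextInt());
        }

        Comparator<Spec> order = Comparator
                .comparing((Spec spec) -> spec.domain().contains(".edu"))
                .reversed()                                  // .edu domains first
                .thenComparing(spec -> randomOrder.get(spec.domain()))
                .thenComparing(Record::hashCode);            // non-deterministic tie-breaker

        // Prints the two .edu domains first, then the rest, in a per-run random order.
        specs.stream().sorted(order).forEach(s -> System.out.println(s.domain()));
    }
}
```

Sorting on a boolean key followed by a per-run random key keeps the comparator cheap while still avoiding the stable ordering a hash-based shuffle would give, which is the restart-randomization property the javadoc calls out.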
|
CrawlDelayTimer.java

@@ -3,6 +3,7 @@ package nu.marginalia.crawl.retreival;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 
 import java.time.Duration;
+import java.util.concurrent.ThreadLocalRandom;
 
 import static java.lang.Math.max;
 import static java.lang.Math.min;
@@ -53,12 +54,13 @@ public class CrawlDelayTimer {
     public void waitFetchDelay(long spentTime) {
         long sleepTime = delayTime;
 
+        long jitter = ThreadLocalRandom.current().nextLong(0, 150);
         try {
             if (sleepTime >= 1) {
                 if (spentTime > sleepTime)
                     return;
 
-                Thread.sleep(min(sleepTime - spentTime, 5000));
+                Thread.sleep(min(sleepTime - spentTime, 5000) + jitter);
             } else {
                 // When no crawl delay is specified, lean toward twice the fetch+process time,
                 // within sane limits. This means slower servers get slower crawling, and faster
@@ -71,12 +73,12 @@ public class CrawlDelayTimer {
                 if (spentTime > sleepTime)
                     return;
 
-                Thread.sleep(sleepTime - spentTime + jitter);
+                Thread.sleep(sleepTime - spentTime + jitter);
             }
 
             if (slowDown) {
                 // Additional delay when the server is signalling it wants slower requests
-                Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS);
+                Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS + jitter);
            }
         }
         catch (InterruptedException e) {
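The CrawlDelayTimer change draws a fresh random jitter below 150 ms once per waitFetchDelay call and adds it to every sleep, so crawl tasks that happen to start in lockstep drift apart rather than hitting a server at the same instant. A minimal sketch of the same pattern, assuming an illustrative `sleepWithJitter` helper and a placeholder delay constant rather than the class's real fields:

```java
import java.util.concurrent.ThreadLocalRandom;

import static java.lang.Math.min;

public class JitterSketch {
    // Placeholder value for illustration; the real constant lives in CrawlDelayTimer.
    private static final long DEFAULT_CRAWL_DELAY_MIN_MS = 1_000;

    /** Illustrative helper: sleep for the remaining delay plus a small random jitter. */
    static void sleepWithJitter(long delayMs, long spentTimeMs) throws InterruptedException {
        // Fresh jitter per call, same range as the patch (0-149 ms).
        long jitter = ThreadLocalRandom.current().nextLong(0, 150);

        if (spentTimeMs > delayMs)
            return; // the fetch already took longer than the requested delay

        // Cap the base sleep at 5 seconds, as the patched code does, then add jitter.
        Thread.sleep(min(delayMs - spentTimeMs, 5_000) + jitter);
    }

    public static void main(String[] args) throws InterruptedException {
        long start = System.nanoTime();
        sleepWithJitter(DEFAULT_CRAWL_DELAY_MIN_MS, 200);
        System.out.printf("slept ~%d ms%n", (System.nanoTime() - start) / 1_000_000);
    }
}
```

ThreadLocalRandom is a reasonable fit here because each crawler thread gets its own generator, avoiding contention on a shared Random instance when many tasks sleep concurrently.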
|