mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
4 Commits
deploy-011
...
deploy-011
Author | SHA1 | Date | |
---|---|---|---|
|
c91af247e9 | ||
|
7a31227de1 | ||
|
4f477604c5 | ||
|
2970f4395b |
@@ -35,21 +35,8 @@ public class RateLimiter {
|
||||
}
|
||||
|
||||
|
||||
public static RateLimiter forExpensiveRequest() {
|
||||
return new RateLimiter(5, 10);
|
||||
}
|
||||
|
||||
public static RateLimiter custom(int perMinute) {
|
||||
return new RateLimiter(perMinute, 60);
|
||||
}
|
||||
|
||||
public static RateLimiter forSpamBots() {
|
||||
return new RateLimiter(120, 3600);
|
||||
}
|
||||
|
||||
|
||||
public static RateLimiter forLogin() {
|
||||
return new RateLimiter(3, 15);
|
||||
return new RateLimiter(4 * perMinute, perMinute);
|
||||
}
|
||||
|
||||
private void cleanIdleBuckets() {
|
||||
@@ -62,7 +49,7 @@ public class RateLimiter {
|
||||
}
|
||||
|
||||
private Bucket createBucket() {
|
||||
var refill = Refill.greedy(1, Duration.ofSeconds(refillRate));
|
||||
var refill = Refill.greedy(refillRate, Duration.ofSeconds(60));
|
||||
var bw = Bandwidth.classic(capacity, refill);
|
||||
return Bucket.builder().addLimit(bw).build();
|
||||
}
|
||||
|
@@ -501,7 +501,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
return new CrawlDataReference(slopPath);
|
||||
}
|
||||
|
||||
} catch (IOException e) {
|
||||
} catch (Exception e) {
|
||||
logger.debug("Failed to read previous crawl data for {}", specification.domain());
|
||||
}
|
||||
|
||||
|
@@ -160,8 +160,15 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
|
||||
// Fetch sitemaps
|
||||
for (var sitemap : robotsRules.getSitemaps()) {
|
||||
|
||||
// Validate the sitemap URL and check that it belongs to the same domain as the root URL
|
||||
if (EdgeUrl.parse(sitemap)
|
||||
.map(url -> url.getDomain().equals(rootUrl.domain))
|
||||
.orElse(false)) {
|
||||
|
||||
crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
|
||||
}
|
||||
}
|
||||
|
||||
int crawlerAdditions = 0;
|
||||
|
||||
|
@@ -10,12 +10,10 @@ import org.netpreserve.jwarc.WarcResponse;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.InetSocketAddress;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.HashMap;
|
||||
@@ -29,6 +27,8 @@ class WarcRecorderFakeServerTest {
|
||||
@BeforeAll
|
||||
public static void setUpAll() throws IOException {
|
||||
server = HttpServer.create(new InetSocketAddress("127.0.0.1", 14510), 10);
|
||||
|
||||
// This endpoint will finish sending the response immediately
|
||||
server.createContext("/fast", exchange -> {
|
||||
exchange.getResponseHeaders().add("Content-Type", "text/html");
|
||||
exchange.sendResponseHeaders(200, "<html><body>hello</body></html>".length());
|
||||
@@ -40,6 +40,8 @@ class WarcRecorderFakeServerTest {
|
||||
exchange.close();
|
||||
});
|
||||
|
||||
// This endpoint will take 10 seconds to finish sending the response,
|
||||
// which should trigger a timeout in the client
|
||||
server.createContext("/slow", exchange -> {
|
||||
exchange.getResponseHeaders().add("Content-Type", "text/html");
|
||||
exchange.sendResponseHeaders(200, "<html><body>hello</body></html>:D".length());
|
||||
@@ -88,7 +90,7 @@ class WarcRecorderFakeServerTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
void fetchFast() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
|
||||
public void fetchFast() throws Exception {
|
||||
client.fetch(httpClient,
|
||||
HttpRequest.newBuilder()
|
||||
.uri(new java.net.URI("http://localhost:14510/fast"))
|
||||
@@ -114,7 +116,7 @@ class WarcRecorderFakeServerTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
void fetchSlow() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
|
||||
public void fetchSlow() throws Exception {
|
||||
Instant start = Instant.now();
|
||||
client.fetch(httpClient,
|
||||
HttpRequest.newBuilder()
|
||||
@@ -141,9 +143,10 @@ class WarcRecorderFakeServerTest {
|
||||
|
||||
System.out.println(sampleData);
|
||||
|
||||
// Timeout is set to 1 second, but the server will take 5 seconds to respond, so we expect the request to take 1s and change
|
||||
// before it times out.
|
||||
Assertions.assertTrue(Duration.between(start, end).toMillis() < 2000, "Request should take less than 2 seconds");
|
||||
// Timeout is set to 1 second, but the server will take 5 seconds to respond,
|
||||
// so we expect the request to take 1s and change before it times out.
|
||||
|
||||
Assertions.assertTrue(Duration.between(start, end).toMillis() < 2000);
|
||||
}
|
||||
|
||||
}
|
Reference in New Issue
Block a user