1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

4 Commits

Author SHA1 Message Date
Viktor Lofgren
c91af247e9 (rate-limit) Fix rate limiting logic
The rate limiter was misconfigured to regenerate tokens at a fixed rate of 1 token per refillRate seconds, rather than refillRate tokens per minute. Additionally, the default bucket size is increased to 4x the refill rate.
2025-04-05 12:26:26 +02:00
Viktor Lofgren
7a31227de1 (crawler) Filter out robots.txt-sitemaps that belong to different domains 2025-04-02 13:35:37 +02:00
Viktor Lofgren
4f477604c5 (crawler) Improve error handling in parquet->slop conversion
Parquet code throws a RuntimeException, which was not correctly caught, leading to a failure to crawl.
2025-04-02 13:16:01 +02:00
Viktor Lofgren
2970f4395b (minor) Test code cleanup 2025-04-02 13:16:01 +02:00
4 changed files with 21 additions and 24 deletions

View File

@@ -35,21 +35,8 @@ public class RateLimiter {
}
public static RateLimiter forExpensiveRequest() {
return new RateLimiter(5, 10);
}
public static RateLimiter custom(int perMinute) {
return new RateLimiter(perMinute, 60);
}
public static RateLimiter forSpamBots() {
return new RateLimiter(120, 3600);
}
public static RateLimiter forLogin() {
return new RateLimiter(3, 15);
return new RateLimiter(4 * perMinute, perMinute);
}
private void cleanIdleBuckets() {
@@ -62,7 +49,7 @@ public class RateLimiter {
}
private Bucket createBucket() {
var refill = Refill.greedy(1, Duration.ofSeconds(refillRate));
var refill = Refill.greedy(refillRate, Duration.ofSeconds(60));
var bw = Bandwidth.classic(capacity, refill);
return Bucket.builder().addLimit(bw).build();
}

View File

@@ -501,7 +501,7 @@ public class CrawlerMain extends ProcessMainClass {
return new CrawlDataReference(slopPath);
}
} catch (IOException e) {
} catch (Exception e) {
logger.debug("Failed to read previous crawl data for {}", specification.domain());
}

View File

@@ -160,8 +160,15 @@ public class CrawlerRetreiver implements AutoCloseable {
// Fetch sitemaps
for (var sitemap : robotsRules.getSitemaps()) {
// Validate the sitemap URL and check if it belongs to the same domain as the root URL
if (EdgeUrl.parse(sitemap)
.map(url -> url.getDomain().equals(rootUrl.domain))
.orElse(false)) {
crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
}
}
int crawlerAdditions = 0;

View File

@@ -10,12 +10,10 @@ import org.netpreserve.jwarc.WarcResponse;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.NoSuchAlgorithmException;
import java.time.Duration;
import java.time.Instant;
import java.util.HashMap;
@@ -29,6 +27,8 @@ class WarcRecorderFakeServerTest {
@BeforeAll
public static void setUpAll() throws IOException {
server = HttpServer.create(new InetSocketAddress("127.0.0.1", 14510), 10);
// This endpoint will finish sending the response immediately
server.createContext("/fast", exchange -> {
exchange.getResponseHeaders().add("Content-Type", "text/html");
exchange.sendResponseHeaders(200, "<html><body>hello</body></html>".length());
@@ -40,6 +40,8 @@ class WarcRecorderFakeServerTest {
exchange.close();
});
// This endpoint will take 10 seconds to finish sending the response,
// which should trigger a timeout in the client
server.createContext("/slow", exchange -> {
exchange.getResponseHeaders().add("Content-Type", "text/html");
exchange.sendResponseHeaders(200, "<html><body>hello</body></html>:D".length());
@@ -88,7 +90,7 @@ class WarcRecorderFakeServerTest {
}
@Test
void fetchFast() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
public void fetchFast() throws Exception {
client.fetch(httpClient,
HttpRequest.newBuilder()
.uri(new java.net.URI("http://localhost:14510/fast"))
@@ -114,7 +116,7 @@ class WarcRecorderFakeServerTest {
}
@Test
void fetchSlow() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
public void fetchSlow() throws Exception {
Instant start = Instant.now();
client.fetch(httpClient,
HttpRequest.newBuilder()
@@ -141,9 +143,10 @@ class WarcRecorderFakeServerTest {
System.out.println(sampleData);
// Timeout is set to 1 second, but the server will take 5 seconds to respond, so we expect the request to take 1s and change
// before it times out.
Assertions.assertTrue(Duration.between(start, end).toMillis() < 2000, "Request should take less than 2 seconds");
// Timeout is set to 1 second, but the server will take 5 seconds to respond,
// so we expect the request to take 1s and change before it times out.
Assertions.assertTrue(Duration.between(start, end).toMillis() < 2000);
}
}