mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits


7 Commits

Author          SHA1        Message                                                        Date
Viktor Lofgren  2e3f1313c7  (crawler) Log exceptions while crawling in crawler audit log  2025-04-17 21:18:09 +02:00
Viktor Lofgren  58e6f141ce  (crawler) Reduce congestion throttle go-rate                  2025-04-17 20:36:58 +02:00
Viktor Lofgren  500f63e921  (crawler) Lower max conn per route                            2025-04-17 18:36:16 +02:00
Viktor Lofgren  6dfbedda1e  (crawler) Increase max conn per route and connection timeout  2025-04-17 18:31:46 +02:00
Viktor Lofgren  9715ddb105  (crawler) Increase max pool size to a large value             2025-04-17 18:22:58 +02:00
Viktor Lofgren  1fc6313a77  (crawler) Remove log noise when retrying a bad URL            2025-04-17 17:10:46 +02:00
Viktor Lofgren  b1249d5b8a  (crawler) Fix broken test.                                    2025-04-17 17:01:42 +02:00
3 changed files with 6 additions and 6 deletions

View File: HttpFetcherImpl.java

@@ -80,11 +80,12 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
     private CloseableHttpClient createClient() throws NoSuchAlgorithmException {
         final ConnectionConfig connectionConfig = ConnectionConfig.custom()
                 .setSocketTimeout(10, TimeUnit.SECONDS)
-                .setConnectTimeout(10, TimeUnit.SECONDS)
+                .setConnectTimeout(30, TimeUnit.SECONDS)
                 .build();

         final PoolingHttpClientConnectionManager connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
-                .setMaxConnPerRoute(4)
+                .setMaxConnPerRoute(2)
+                .setMaxConnTotal(5000)
                 .setDefaultConnectionConfig(connectionConfig)
                 .setTlsSocketStrategy(new DefaultClientTlsStrategy(SSLContext.getDefault()))
                 .build();
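For context, a self-contained sketch (not part of this diff) of how the pool settings above fit together in Apache HttpClient 5; the builder calls mirror the hunk, while the wrapper class and method names are illustrative only:

// Sketch assuming Apache HttpClient 5.4+ (setTlsSocketStrategy appears in the hunk above).
// Two connections per route keeps per-host load polite; the very large total (5000)
// means the shared pool itself is effectively never the limiting factor.
import org.apache.hc.client5.http.config.ConnectionConfig;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager;
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder;
import org.apache.hc.client5.http.ssl.DefaultClientTlsStrategy;

import javax.net.ssl.SSLContext;
import java.security.NoSuchAlgorithmException;
import java.util.concurrent.TimeUnit;

class CrawlerHttpClientSketch {
    static CloseableHttpClient create() throws NoSuchAlgorithmException {
        ConnectionConfig connectionConfig = ConnectionConfig.custom()
                .setSocketTimeout(10, TimeUnit.SECONDS)   // per-read timeout
                .setConnectTimeout(30, TimeUnit.SECONDS)  // TCP connect timeout, raised in this diff
                .build();

        PoolingHttpClientConnectionManager connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
                .setMaxConnPerRoute(2)     // at most 2 concurrent connections per host:port
                .setMaxConnTotal(5000)     // huge global cap, so the per-route limit is what actually throttles
                .setDefaultConnectionConfig(connectionConfig)
                .setTlsSocketStrategy(new DefaultClientTlsStrategy(SSLContext.getDefault()))
                .build();

        return HttpClients.custom()
                .setConnectionManager(connectionManager)
                .build();
    }
}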
@@ -418,7 +419,7 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
             case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {}", ok.statusCode(), url);
             case HttpFetchResult.ResultRedirect redirect -> logger.info(crawlerAuditMarker, "Fetch result redirect: {} for {}", redirect.url(), url);
             case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none for {}", url);
-            case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception: {} for {}", ex.getClass().getSimpleName(), url);
+            case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception for " + url + ": {}", ex.ex());
             case HttpFetchResult.Result304Raw raw -> logger.info(crawlerAuditMarker, "Fetch result: 304 Raw for {}", url);
             case HttpFetchResult.Result304ReplacedWithReference ref -> logger.info(crawlerAuditMarker, "Fetch result: 304 With reference for {}", url);
         }
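The crawlerAuditMarker used above is an SLF4J Marker. As a rough illustration (the marker name, class, and method below are placeholders, not the project's actual values), marker-tagged audit logging looks like this; a backend such as Logback can then route marker-tagged events to a dedicated audit log file:

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;

// Hypothetical audit-log helper; only the SLF4J API usage is standard.
class CrawlerAuditLogSketch {
    private static final Logger logger = LoggerFactory.getLogger(CrawlerAuditLogSketch.class);
    private static final Marker crawlerAuditMarker = MarkerFactory.getMarker("CRAWLER_AUDIT");

    void fetchFailed(String url, Exception cause) {
        // Passing the Throwable as the trailing argument attaches its stack trace to the entry,
        // so the audit log records why the fetch failed, not just that it did.
        logger.error(crawlerAuditMarker, "Fetch result exception for {}", url, cause);
    }
}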
@@ -613,7 +614,6 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
     @Override
     public TimeValue getRetryInterval(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
-        logger.info("Error", exception);
         return TimeValue.ofSeconds(1);
     }
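HttpFetcherImpl implements Apache HttpClient's HttpRequestRetryStrategy, and this change drops the per-retry log line. A minimal standalone sketch of such a strategy, assuming a flat 1-second retry interval as in the hunk above; the retry conditions shown are illustrative, not the project's actual rules:

import org.apache.hc.client5.http.HttpRequestRetryStrategy;
import org.apache.hc.core5.http.HttpRequest;
import org.apache.hc.core5.http.HttpResponse;
import org.apache.hc.core5.http.protocol.HttpContext;
import org.apache.hc.core5.util.TimeValue;

import java.io.IOException;

// Illustrative retry strategy: a few quiet attempts, a flat 1-second pause, no per-retry logging.
class QuietRetryStrategySketch implements HttpRequestRetryStrategy {
    @Override
    public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
        return executionCount <= 3; // retry transient I/O failures a few times, silently
    }

    @Override
    public boolean retryRequest(HttpResponse response, int executionCount, HttpContext context) {
        return response.getCode() == 429 && executionCount <= 3; // illustrative: retry only rate-limit responses
    }

    @Override
    public TimeValue getRetryInterval(HttpResponse response, int executionCount, HttpContext context) {
        return TimeValue.ofSeconds(1); // same flat interval as the hunk above
    }
}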

View File: CrawlerRetreiver.java

@@ -53,7 +53,7 @@ public class CrawlerRetreiver implements AutoCloseable {
     private final CrawlerRevisitor crawlerRevisitor;

     private static final CrawlerConnectionThrottle connectionThrottle = new CrawlerConnectionThrottle(
-            Duration.ofSeconds(1) // pace the connections to avoid network congestion by waiting 1 second between establishing them
+            Duration.ofMillis(50) // pace the connections to avoid network congestion at startup
     );

     int errorCount = 0;
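CrawlerConnectionThrottle's implementation is not part of this diff. A hypothetical sketch of such a throttle, assuming it simply spaces out connection attempts by the configured interval (50 ms after this change); all names and internals below are assumptions:

import java.time.Duration;

// Hypothetical throttle: callers block so that successive connection attempts are spaced
// at least `interval` apart; the synchronized method also serializes the attempts.
class ConnectionThrottleSketch {
    private final long intervalNanos;
    private long nextPermittedAt = System.nanoTime();

    ConnectionThrottleSketch(Duration interval) {
        this.intervalNanos = interval.toNanos();
    }

    synchronized void waitForPermission() throws InterruptedException {
        long waitNanos = nextPermittedAt - System.nanoTime();
        if (waitNanos > 0) {
            Thread.sleep(waitNanos / 1_000_000, (int) (waitNanos % 1_000_000));
        }
        nextPermittedAt = System.nanoTime() + intervalNanos;
    }
}

A crawler task would call waitForPermission() immediately before opening a new connection, with the throttle constructed as new ConnectionThrottleSketch(Duration.ofMillis(50)).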

View File: ContentTypeProberTest.java

@@ -101,7 +101,7 @@ class ContentTypeProberTest {
         System.out.println(result);
-        assertEquals(result, new HttpFetcher.ContentTypeProbeResult.Ok(htmlEndpoint));
+        assertEquals(result, new HttpFetcher.ContentTypeProbeResult.Redirect(htmlEndpoint));
     }

     @Test