mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits


5 Commits

Author SHA1 Message Date

Viktor Lofgren
78cc25584a (crawler) Add error logging when entering bad path for historical crawl data
2025-03-10 13:38:40 +01:00

Viktor Lofgren
62ba30bacf (common) Log info about metrics server
2025-03-10 13:12:39 +01:00

Viktor Lofgren
3bb84eb206 (common) Log info about metrics server
2025-03-10 13:03:48 +01:00

Viktor Lofgren
be7d13ccce (crawler) Correct task execution logic in crawler
The old behavior would flag domains as pending too soon, leading to them being omitted from execution if they were not immediately available to run.
2025-03-09 13:47:51 +01:00

Viktor Lofgren
8c088a7c0b (crawler) Remove custom thread factory
This was causing issues, and not really doing much of benefit.
2025-03-09 11:50:52 +01:00
5 changed files with 17 additions and 19 deletions

View File: ServiceConfigurationModule.java

@@ -121,6 +121,7 @@ public class ServiceConfigurationModule extends AbstractModule {
         while (nets.hasMoreElements()) {
             NetworkInterface netif = nets.nextElement();
+            logger.info("Considering network interface {}: Up? {}, Loopback? {}", netif.getDisplayName(), netif.isUp(), netif.isLoopback());
             if (!netif.isUp() || netif.isLoopback()) {
                 continue;
             }
@@ -128,6 +129,7 @@ public class ServiceConfigurationModule extends AbstractModule {
             Enumeration<InetAddress> inetAddresses = netif.getInetAddresses();
             while (inetAddresses.hasMoreElements()) {
                 InetAddress addr = inetAddresses.nextElement();
+                logger.info("Considering address {}: SiteLocal? {}, Loopback? {}", addr.getHostAddress(), addr.isSiteLocalAddress(), addr.isLoopbackAddress());
                 if (addr.isSiteLocalAddress() && !addr.isLoopbackAddress()) {
                     return addr.getHostAddress();
                 }
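Together the two hunks trace the address-selection pass: the module walks every network interface and returns the first site-local, non-loopback address, and the added logger.info calls make each rejected candidate visible when that search comes up empty. The same loop as a standalone sketch (the class and method names here are illustrative, not the module's actual API):

import java.net.InetAddress;
import java.net.NetworkInterface;
import java.net.SocketException;
import java.util.Enumeration;

class AddressDiscovery {
    static String findSiteLocalAddress() throws SocketException {
        Enumeration<NetworkInterface> nets = NetworkInterface.getNetworkInterfaces();
        while (nets.hasMoreElements()) {
            NetworkInterface netif = nets.nextElement();
            // Skip interfaces that are down or loopback-only
            if (!netif.isUp() || netif.isLoopback())
                continue;
            Enumeration<InetAddress> inetAddresses = netif.getInetAddresses();
            while (inetAddresses.hasMoreElements()) {
                InetAddress addr = inetAddresses.nextElement();
                // First site-local (RFC 1918-style), non-loopback address wins
                if (addr.isSiteLocalAddress() && !addr.isLoopbackAddress())
                    return addr.getHostAddress();
            }
        }
        return null; // no suitable address found
    }
}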

View File: MetricsServer.java

@@ -13,7 +13,7 @@ import java.net.InetSocketAddress;
 public class MetricsServer {
-    private static Logger logger = LoggerFactory.getLogger(MetricsServer.class);
+    private static final Logger logger = LoggerFactory.getLogger(MetricsServer.class);
     @Inject
     public MetricsServer(ServiceConfiguration configuration) {
@@ -30,6 +30,8 @@ public class MetricsServer {
         context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
+        logger.info("MetricsServer listening on {}:{}", configuration.bindAddress(), configuration.metricsPort());
         server.start();
     }
     catch (Exception|NoSuchMethodError ex) {
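The first hunk makes the static logger final, the conventional SLF4J pattern; the second adds a listen-address log before server.start(). A minimal sketch of the same setup, assuming Jetty's ServletContextHandler and the Prometheus simpleclient MetricsServlet, with literals standing in for the ServiceConfiguration values:

import java.net.InetSocketAddress;

import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.servlet.ServletContextHandler;
import org.eclipse.jetty.servlet.ServletHolder;

import io.prometheus.client.exporter.MetricsServlet;

class MetricsServerSketch {
    public static void main(String[] args) throws Exception {
        String bindAddress = "127.0.0.1";   // stand-in for configuration.bindAddress()
        int metricsPort = 9090;             // stand-in for configuration.metricsPort()

        Server server = new Server(new InetSocketAddress(bindAddress, metricsPort));
        ServletContextHandler context = new ServletContextHandler();
        server.setHandler(context);
        context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");

        // Mirrors the added logger.info line from the diff
        System.out.printf("MetricsServer listening on %s:%d%n", bindAddress, metricsPort);
        server.start();
    }
}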

View File: CrawlerMain.java

@@ -266,11 +266,11 @@ public class CrawlerMain extends ProcessMainClass {
             // Start every task we currently can from the deferral list
             deferredTasks.removeIf(task -> {
-                if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) != null) {
-                    return true; // task has already run, duplicate in crawl specs
-                }
                 if (task.canRun()) {
+                    if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) != null) {
+                        return true; // task has already run, duplicate in crawl specs
+                    }
                     // This blocks the caller when the pool is full
                     pool.submitQuietly(task);
                     return true;
@@ -280,7 +280,7 @@ public class CrawlerMain extends ProcessMainClass {
             });
         }
-        // Schedule any lingering tasks
+        // Schedule any lingering tasks for immediate execution
         for (var task : deferredTasks) {
             if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null)
                 continue;
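This is the fix the commit message describes: the old code registered a domain in pendingCrawlTasks before checking canRun(), so a task that stayed on the deferral list was already flagged as pending, and every later scheduling pass dropped it as a duplicate. A toy reproduction of that ordering bug (Task, canRun, and the map are illustrative stand-ins for the crawler's types):

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

class DeferralSketch {
    record Task(String domain, boolean canRun) {}

    public static void main(String[] args) {
        ConcurrentMap<String, Task> pending = new ConcurrentHashMap<>();
        List<Task> deferred = new ArrayList<>(List.of(new Task("example.com", false)));

        // Buggy order: mark the domain pending before checking canRun()
        deferred.removeIf(task -> {
            if (pending.putIfAbsent(task.domain(), task) != null)
                return true;          // treated as an already-run duplicate
            return task.canRun();     // false: task stays deferred, but is now "pending"
        });

        // A later pass now mistakes the still-deferred task for a duplicate
        Task retry = new Task("example.com", true);
        boolean droppedAsDuplicate = pending.putIfAbsent(retry.domain(), retry) != null;
        System.out.println("dropped as duplicate: " + droppedAsDuplicate); // true: never runs
    }
}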

View File: HttpFetcherImpl.java

@@ -60,15 +60,7 @@ public class HttpFetcherImpl implements HttpFetcher {
                 .cookieHandler(cookies)
                 .followRedirects(HttpClient.Redirect.NORMAL)
                 .connectTimeout(Duration.ofSeconds(8))
-                .executor(Executors.newCachedThreadPool(
-                        r -> Thread.ofPlatform()
-                                .name("FetcherClient")
-                                .daemon(true)
-                                .uncaughtExceptionHandler((t, ex) -> {
-                                    logger.error("Uncaught Exception in " + t.getName(), ex);
-                                })
-                                .start(r)
-                ))
+                .executor(Executors.newCachedThreadPool())
                 .build();
     }
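The removed block wired a custom platform-thread factory (named daemon threads plus an uncaught-exception logger) into the HttpClient's executor; the replacement accepts the default thread handling of a plain cached pool. The simplified construction as a standalone sketch (the CookieManager is a stand-in for the fetcher's cookie store, which isn't shown here):

import java.net.CookieManager;
import java.net.http.HttpClient;
import java.time.Duration;
import java.util.concurrent.Executors;

class FetcherClientSketch {
    static HttpClient buildClient() {
        return HttpClient.newBuilder()
                .cookieHandler(new CookieManager())   // stand-in for the fetcher's cookie handler
                .followRedirects(HttpClient.Redirect.NORMAL)
                .connectTimeout(Duration.ofSeconds(8))
                // Default thread naming and exception handling; the custom
                // "FetcherClient" daemon-thread factory was removed
                .executor(Executors.newCachedThreadPool())
                .build();
    }
}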

View File: SerializableCrawlDataStream.java

@@ -42,18 +42,20 @@ public interface SerializableCrawlDataStream extends AutoCloseable {
     {
         String fileName = fullPath.getFileName().toString();
-        if (fileName.endsWith(".parquet")) {
+        if (fileName.endsWith(".slop.zip")) {
             try {
-                return new ParquetSerializableCrawlDataStream(fullPath);
+                return new SlopSerializableCrawlDataStream(fullPath);
             } catch (Exception ex) {
                 logger.error("Error reading domain data from " + fullPath, ex);
                 return SerializableCrawlDataStream.empty();
             }
         }
-        if (fileName.endsWith(".slop.zip")) {
+        else if (fileName.endsWith(".parquet")) {
+            logger.error("Opening deprecated parquet-style crawl data stream", new Exception());
             try {
-                return new SlopSerializableCrawlDataStream(fullPath);
+                return new ParquetSerializableCrawlDataStream(fullPath);
             } catch (Exception ex) {
                 logger.error("Error reading domain data from " + fullPath, ex);
                 return SerializableCrawlDataStream.empty();
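The reordering prefers the current .slop.zip format, and the added line is the error logging the top commit advertises: passing a fresh Exception to logger.error attaches the current call stack without throwing anything, so the log shows which caller still feeds the crawler deprecated parquet data. The idiom in isolation (plain SLF4J; the class and method here are hypothetical):

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

class DeprecationTrace {
    private static final Logger logger = LoggerFactory.getLogger(DeprecationTrace.class);

    static void openDeprecated(String path) {
        // No throw: the Exception exists only to carry a stack trace into the log,
        // revealing which code path handed us the deprecated file
        logger.error("Opening deprecated parquet-style crawl data stream", new Exception());
    }
}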