mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits


5 Commits

Author SHA1 Message Date

Viktor Lofgren
78cc25584a (crawler) Add error logging when entering bad path for historical crawl data
2025-03-10 13:38:40 +01:00

Viktor Lofgren
62ba30bacf (common) Log info about metrics server
2025-03-10 13:12:39 +01:00

Viktor Lofgren
3bb84eb206 (common) Log info about metrics server
2025-03-10 13:03:48 +01:00

Viktor Lofgren
be7d13ccce (crawler) Correct task execution logic in crawler
The old behavior would flag domains as pending too soon, leading to them being omitted from execution if they were not immediately available to run.
2025-03-09 13:47:51 +01:00

Viktor Lofgren
8c088a7c0b (crawler) Remove custom thread factory
This was causing issues, and not really doing much of benefit.
2025-03-09 11:50:52 +01:00
5 changed files with 17 additions and 19 deletions

View File: ServiceConfigurationModule.java

@@ -121,6 +121,7 @@ public class ServiceConfigurationModule extends AbstractModule {
         while (nets.hasMoreElements()) {
             NetworkInterface netif = nets.nextElement();
+            logger.info("Considering network interface {}: Up? {}, Loopback? {}", netif.getDisplayName(), netif.isUp(), netif.isLoopback());
             if (!netif.isUp() || netif.isLoopback()) {
                 continue;
             }
@@ -128,6 +129,7 @@ public class ServiceConfigurationModule extends AbstractModule {
             Enumeration<InetAddress> inetAddresses = netif.getInetAddresses();
             while (inetAddresses.hasMoreElements()) {
                 InetAddress addr = inetAddresses.nextElement();
+                logger.info("Considering address {}: SiteLocal? {}, Loopback? {}", addr.getHostAddress(), addr.isSiteLocalAddress(), addr.isLoopbackAddress());
                 if (addr.isSiteLocalAddress() && !addr.isLoopbackAddress()) {
                     return addr.getHostAddress();
                 }
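Together the two hunks trace the address-selection pass: the module walks every network interface and returns the first site-local, non-loopback address, and the added logger.info calls make each rejected candidate visible when that search comes up empty. The same loop as a standalone sketch (the class and method names here are illustrative, not the module's actual API):

import java.net.InetAddress;
import java.net.NetworkInterface;
import java.net.SocketException;
import java.util.Enumeration;

class AddressDiscovery {
    static String findSiteLocalAddress() throws SocketException {
        Enumeration<NetworkInterface> nets = NetworkInterface.getNetworkInterfaces();
        while (nets.hasMoreElements()) {
            NetworkInterface netif = nets.nextElement();
            // Skip interfaces that are down or loopback-only
            if (!netif.isUp() || netif.isLoopback())
                continue;
            Enumeration<InetAddress> inetAddresses = netif.getInetAddresses();
            while (inetAddresses.hasMoreElements()) {
                InetAddress addr = inetAddresses.nextElement();
                // First site-local (RFC 1918-style), non-loopback address wins
                if (addr.isSiteLocalAddress() && !addr.isLoopbackAddress())
                    return addr.getHostAddress();
            }
        }
        return null; // no suitable address found
    }
}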

View File: MetricsServer.java

@@ -13,7 +13,7 @@ import java.net.InetSocketAddress;
 public class MetricsServer {
-    private static Logger logger = LoggerFactory.getLogger(MetricsServer.class);
+    private static final Logger logger = LoggerFactory.getLogger(MetricsServer.class);
     @Inject
     public MetricsServer(ServiceConfiguration configuration) {
@@ -30,6 +30,8 @@ public class MetricsServer {
         context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
+        logger.info("MetricsServer listening on {}:{}", configuration.bindAddress(), configuration.metricsPort());
         server.start();
     }
     catch (Exception|NoSuchMethodError ex) {
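The first hunk makes the static logger final, the conventional SLF4J pattern; the second adds a listen-address log before server.start(). A minimal sketch of the same setup, assuming Jetty's ServletContextHandler and the Prometheus simpleclient MetricsServlet, with literals standing in for the ServiceConfiguration values:

import java.net.InetSocketAddress;

import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.servlet.ServletContextHandler;
import org.eclipse.jetty.servlet.ServletHolder;

import io.prometheus.client.exporter.MetricsServlet;

class MetricsServerSketch {
    public static void main(String[] args) throws Exception {
        String bindAddress = "127.0.0.1";   // stand-in for configuration.bindAddress()
        int metricsPort = 9090;             // stand-in for configuration.metricsPort()

        Server server = new Server(new InetSocketAddress(bindAddress, metricsPort));
        ServletContextHandler context = new ServletContextHandler();
        server.setHandler(context);
        context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");

        // Mirrors the added logger.info line from the diff
        System.out.printf("MetricsServer listening on %s:%d%n", bindAddress, metricsPort);
        server.start();
    }
}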

View File: CrawlerMain.java

@@ -266,11 +266,11 @@ public class CrawlerMain extends ProcessMainClass {
             // Start every task we currently can from the deferral list
             deferredTasks.removeIf(task -> {
-                if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) != null) {
-                    return true; // task has already run, duplicate in crawl specs
-                }
                 if (task.canRun()) {
+                    if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) != null) {
+                        return true; // task has already run, duplicate in crawl specs
+                    }
                     // This blocks the caller when the pool is full
                     pool.submitQuietly(task);
                     return true;
@@ -280,7 +280,7 @@ public class CrawlerMain extends ProcessMainClass {
             });
         }
-        // Schedule any lingering tasks
+        // Schedule any lingering tasks for immediate execution
         for (var task : deferredTasks) {
             if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null)
                 continue;
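This is the fix the commit message describes: the old code registered a domain in pendingCrawlTasks before checking canRun(), so a task that stayed on the deferral list was already flagged as pending, and every later scheduling pass dropped it as a duplicate. A toy reproduction of that ordering bug (Task, canRun, and the map are illustrative stand-ins for the crawler's types):

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

class DeferralSketch {
    record Task(String domain, boolean canRun) {}

    public static void main(String[] args) {
        ConcurrentMap<String, Task> pending = new ConcurrentHashMap<>();
        List<Task> deferred = new ArrayList<>(List.of(new Task("example.com", false)));

        // Buggy order: mark the domain pending before checking canRun()
        deferred.removeIf(task -> {
            if (pending.putIfAbsent(task.domain(), task) != null)
                return true;          // treated as an already-run duplicate
            return task.canRun();     // false: task stays deferred, but is now "pending"
        });

        // A later pass now mistakes the still-deferred task for a duplicate
        Task retry = new Task("example.com", true);
        boolean droppedAsDuplicate = pending.putIfAbsent(retry.domain(), retry) != null;
        System.out.println("dropped as duplicate: " + droppedAsDuplicate); // true: never runs
    }
}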

View File: HttpFetcherImpl.java

@@ -60,15 +60,7 @@ public class HttpFetcherImpl implements HttpFetcher {
                 .cookieHandler(cookies)
                 .followRedirects(HttpClient.Redirect.NORMAL)
                 .connectTimeout(Duration.ofSeconds(8))
-                .executor(Executors.newCachedThreadPool(
-                        r -> Thread.ofPlatform()
-                                .name("FetcherClient")
-                                .daemon(true)
-                                .uncaughtExceptionHandler((t, ex) -> {
-                                    logger.error("Uncaught Exception in " + t.getName(), ex);
-                                })
-                                .start(r)
-                ))
+                .executor(Executors.newCachedThreadPool())
                 .build();
     }
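The removed block wired a custom platform-thread factory (named daemon threads plus an uncaught-exception logger) into the HttpClient's executor; the replacement accepts the default thread handling of a plain cached pool. The simplified construction as a standalone sketch (the CookieManager is a stand-in for the fetcher's cookie store, which isn't shown here):

import java.net.CookieManager;
import java.net.http.HttpClient;
import java.time.Duration;
import java.util.concurrent.Executors;

class FetcherClientSketch {
    static HttpClient buildClient() {
        return HttpClient.newBuilder()
                .cookieHandler(new CookieManager())   // stand-in for the fetcher's cookie handler
                .followRedirects(HttpClient.Redirect.NORMAL)
                .connectTimeout(Duration.ofSeconds(8))
                // Default thread naming and exception handling; the custom
                // "FetcherClient" daemon-thread factory was removed
                .executor(Executors.newCachedThreadPool())
                .build();
    }
}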

View File: SerializableCrawlDataStream.java

@@ -42,18 +42,20 @@ public interface SerializableCrawlDataStream extends AutoCloseable {
     {
         String fileName = fullPath.getFileName().toString();
-        if (fileName.endsWith(".parquet")) {
+        if (fileName.endsWith(".slop.zip")) {
             try {
-                return new ParquetSerializableCrawlDataStream(fullPath);
+                return new SlopSerializableCrawlDataStream(fullPath);
             } catch (Exception ex) {
                 logger.error("Error reading domain data from " + fullPath, ex);
                 return SerializableCrawlDataStream.empty();
             }
         }
-        if (fileName.endsWith(".slop.zip")) {
+        else if (fileName.endsWith(".parquet")) {
+            logger.error("Opening deprecated parquet-style crawl data stream", new Exception());
             try {
-                return new SlopSerializableCrawlDataStream(fullPath);
+                return new ParquetSerializableCrawlDataStream(fullPath);
             } catch (Exception ex) {
                 logger.error("Error reading domain data from " + fullPath, ex);
                 return SerializableCrawlDataStream.empty();
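The reordering prefers the current .slop.zip format, and the added line is the error logging the top commit advertises: passing a fresh Exception to logger.error attaches the current call stack without throwing anything, so the log shows which caller still feeds the crawler deprecated parquet data. The idiom in isolation (plain SLF4J; the class and method here are hypothetical):

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

class DeprecationTrace {
    private static final Logger logger = LoggerFactory.getLogger(DeprecationTrace.class);

    static void openDeprecated(String path) {
        // No throw: the Exception exists only to carry a stack trace into the log,
        // revealing which code path handed us the deprecated file
        logger.error("Opening deprecated parquet-style crawl data stream", new Exception());
    }
}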