(converter) Reduce lock contention in converter by separating the processing of full and simple-track domains

(converter) Truncate excessively long strings in SentenceExtractor; malformed data was effectively DoS-ing the converter
(converter) Add some logging around the simple processing track to investigate an issue with the converter stalling
2025-10-06 07:32:38 +02:00 · 2025-01-26 13:18:14 +01:00 · 2025-01-26 12:52:54 +01:00 · 2025-01-26 12:02:00 +01:00 · 2025-01-24 18:51:41 +01:00 · 2025-01-24 18:50:00 +01:00
32 changed files with 199 additions and 173 deletions
--- a/build.gradle
+++ b/build.gradle
@@ -5,7 +5,7 @@ plugins {

    // This is a workaround for a bug in the Jib plugin that causes it to stall randomly
    // https://github.com/GoogleContainerTools/jib/issues/3347
-    id 'com.google.cloud.tools.jib' version '3.4.3' apply(false)
+    id 'com.google.cloud.tools.jib' version '3.4.4' apply(false)
 }

 group 'marginalia'
--- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java
+++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java
@@ -155,8 +155,15 @@ public class SentenceExtractor {
    public List<DocumentSentence> extractSentencesFromString(String text, EnumSet<HtmlTag> htmlTags) {
        String[] sentences;

-        // Normalize spaces
+        // Safety net against malformed data that acts as a DoS attack:
+        // 5+ MB <p>-tags have been found in the wild, and they break
+        // the sentence extractor, causing it to stall forever.
+        if (text.length() > 50_000) {
+            // 50k chars can hold a small novel, let alone single html tags
+            text = text.substring(0, 50_000);
+        }

+        // Normalize spaces
        text = normalizeSpaces(text);

        // Split into sentences
--- a/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java
@@ -12,6 +12,7 @@ import nu.marginalia.converting.sideload.SideloadSourceFactory;
 import nu.marginalia.converting.writer.ConverterBatchWritableIf;
 import nu.marginalia.converting.writer.ConverterBatchWriter;
 import nu.marginalia.converting.writer.ConverterWriter;
+import nu.marginalia.io.CrawledDomainReader;
 import nu.marginalia.mq.MessageQueueFactory;
 import nu.marginalia.mqapi.converting.ConvertRequest;
 import nu.marginalia.process.ProcessConfiguration;
@@ -49,6 +50,7 @@ public class ConverterMain extends ProcessMainClass {
    private final ProcessHeartbeat heartbeat;
    private final FileStorageService fileStorageService;
    private final SideloadSourceFactory sideloadSourceFactory;
+    private static final int SIDELOAD_THRESHOLD = Integer.getInteger("converter.sideloadThreshold", 10_000);

    public static void main(String... args) throws Exception {

@@ -199,12 +201,19 @@ public class ConverterMain extends ProcessMainClass {
            processedDomains.set(batchingWorkLog.size());
            heartbeat.setProgress(processedDomains.get() / (double) totalDomains);

-            for (var domain : WorkLog.iterableMap(crawlDir.getLogFile(),
+            logger.info("Processing small items");
+
+            // First process the small items
+            for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(),
                    new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog)))
            {
+                if (CrawledDomainReader.sizeHint(dataPath) >= SIDELOAD_THRESHOLD) {
+                    continue;
+                }
+
                pool.submit(() -> {
-                    try {
-                        ConverterBatchWritableIf writable = processor.createWritable(domain);
+                    try (var dataStream = CrawledDomainReader.createDataStream(dataPath)) {
+                        ConverterBatchWritableIf writable = processor.fullProcessing(dataStream) ;
                        converterWriter.accept(writable);
                    }
                    catch (Exception ex) {
@@ -223,6 +232,31 @@ public class ConverterMain extends ProcessMainClass {
            do {
                System.out.println("Waiting for pool to terminate... " + pool.getActiveCount() + " remaining");
            } while (!pool.awaitTermination(60, TimeUnit.SECONDS));
+
+            logger.info("Processing large items");
+
+            // Next the big items domain-by-domain
+            for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(),
+                    new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog)))
+            {
+                int sizeHint = CrawledDomainReader.sizeHint(dataPath);
+                if (sizeHint < SIDELOAD_THRESHOLD) {
+                    continue;
+                }
+
+                try (var dataStream = CrawledDomainReader.createDataStream(dataPath)) {
+                    ConverterBatchWritableIf writable = processor.simpleProcessing(dataStream, sizeHint);
+                    converterWriter.accept(writable);
+                }
+                catch (Exception ex) {
+                    logger.info("Error in processing", ex);
+                }
+                finally {
+                    heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains);
+                }
+            }
+
+            logger.info("Processing complete");
        }
    }

--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java
@@ -14,7 +14,6 @@ import nu.marginalia.converting.writer.ConverterBatchWritableIf;
 import nu.marginalia.converting.writer.ConverterBatchWriter;
 import nu.marginalia.geoip.GeoIpDictionary;
 import nu.marginalia.geoip.sources.AsnTable;
-import nu.marginalia.io.CrawledDomainReader;
 import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.crawl.DomainIndexingState;
@@ -28,13 +27,11 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.IOException;
-import java.nio.file.Path;
 import java.sql.SQLException;
 import java.util.*;
 import java.util.regex.Pattern;

 public class DomainProcessor {
-    private static final int SIDELOAD_THRESHOLD = Integer.getInteger("converter.sideloadThreshold", 10_000);
    private final DocumentProcessor documentProcessor;
    private final SiteWords siteWords;
    private final AnchorTagsSource anchorTagsSource;
@@ -56,21 +53,6 @@ public class DomainProcessor {
        geoIpDictionary.waitReady();
    }

-    public ConverterBatchWritableIf createWritable(Path path) throws IOException {
-
-        var dataStream = CrawledDomainReader.createDataStream(path);
-
-        final int sizeHint = dataStream.sizeHint();
-
-        if (sizeHint > SIDELOAD_THRESHOLD) {
-            // If the file is too big, we run a processing mode that doesn't
-            // require loading the entire dataset into RAM
-            return simpleProcessing(dataStream, sizeHint);
-        }
-
-        return fullProcessing(dataStream);
-    }
-
    public SimpleProcessing simpleProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection<String> extraKeywords) {
        try {
            return new SimpleProcessing(dataStream, sizeHint, extraKeywords);
@@ -159,6 +141,7 @@ public class DomainProcessor {
        private final Set<String> processedUrls = new HashSet<>();
        private final DomainLinks externalDomainLinks;
        private final LshDocumentDeduplicator deduplicator = new LshDocumentDeduplicator();
+
        private static final ProcessingIterator.Factory iteratorFactory = ProcessingIterator.factory(8,
                Integer.getInteger("java.util.concurrent.ForkJoinPool.common.parallelism", Runtime.getRuntime().availableProcessors())
        );
@@ -194,6 +177,7 @@ public class DomainProcessor {
        @Override
        public Iterator<ProcessedDocument> getDocumentsStream() {
            return iteratorFactory.create((taskConsumer) -> {
+
                while (dataStream.hasNext())
                {
                    if (!(dataStream.next() instanceof CrawledDocument doc))
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
@@ -28,13 +28,11 @@ import nu.marginalia.process.ProcessConfigurationModule;
 import nu.marginalia.process.ProcessMainClass;
 import nu.marginalia.process.control.ProcessHeartbeatImpl;
 import nu.marginalia.process.log.WorkLog;
-import nu.marginalia.process.log.WorkLogEntry;
 import nu.marginalia.service.module.DatabaseModule;
 import nu.marginalia.slop.SlopCrawlDataRecord;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorageId;
 import nu.marginalia.util.SimpleBlockingThreadPool;
-import org.apache.logging.log4j.util.Strings;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -44,11 +42,13 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
 import java.security.Security;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
-import java.util.function.Function;

 import static nu.marginalia.mqapi.ProcessInboxNames.CRAWLER_INBOX;

@@ -182,8 +182,6 @@ public class CrawlerMain extends ProcessMainClass {
        // Assign any domains with node_affinity=0 to this node, and then fetch all domains assigned to this node
        // to be crawled.

-        performMigration(outputDir);
-
        try (var conn = dataSource.getConnection()) {
            try (var assignFreeDomains = conn.prepareStatement(
                    """
@@ -417,11 +415,22 @@ public class CrawlerMain extends ProcessMainClass {

        private CrawlDataReference getReference() {
            try {
-                return new CrawlDataReference(CrawledDomainReader.createDataStream(outputDir, domain, id));
+                Path slopPath = CrawlerOutputFile.getSlopPath(outputDir, id, domain);
+                if (Files.exists(slopPath)) {
+                    return new CrawlDataReference(CrawledDomainReader.createDataStream(slopPath));
+                }
+
+                Path parquetPath = CrawlerOutputFile.getParquetPath(outputDir, id, domain);
+                if (Files.exists(parquetPath)) {
+                    slopPath = migrateParquetData(parquetPath, domain, outputDir);
+                    return new CrawlDataReference(CrawledDomainReader.createDataStream(slopPath));
+                }
+
            } catch (IOException e) {
                logger.debug("Failed to read previous crawl data for {}", specification.domain());
-                return new CrawlDataReference();
            }
+
+            return new CrawlDataReference();
        }

    }
@@ -482,92 +491,19 @@ public class CrawlerMain extends ProcessMainClass {
        }
    }

-    // Data migration logic
-
-    private void performMigration(Path root) throws IOException {
-        Path crawlerLog = root.resolve("crawler.log");
-        Path newCrawlerLog = Files.createTempFile(root, "crawler", ".migrate.log");
-
-
-        int finishedTasks = 0;
-        int totalTasks;
-        try (var oldLog = new WorkLog(crawlerLog)) {
-            totalTasks = oldLog.countFinishedJobs();
+    // Migrate a parquet crawl data file to slop; any other input path is returned unchanged.
+    // The suffix is tested on the path string, as Path.endsWith(String) only matches whole name elements.
+    // This must be synchronized as chewing through parquet files in parallel leads to enormous memory overhead
+    private synchronized Path migrateParquetData(Path inputPath, String domain, Path crawlDataRoot) throws IOException {
+        if (!inputPath.toString().endsWith(".parquet")) {
+            return inputPath;
         }

-        try (WorkLog workLog = new WorkLog(newCrawlerLog);
-            var migrationHeartbeat = heartbeat.createAdHocTaskHeartbeat("MIGRATING")) {
+        Path outputFile = CrawlerOutputFile.createSlopPath(crawlDataRoot, Integer.toHexString(domain.hashCode()), domain);

+        SlopCrawlDataRecord.convertFromParquet(inputPath, outputFile);

-
-            for (Map.Entry<WorkLogEntry, Path> item : WorkLog.iterableMap(crawlerLog, new CrawlDataLocator(root))) {
-
-                var entry = item.getKey();
-                var path = item.getValue();
-
-                if (path.toFile().getName().endsWith(".parquet")) {
-                    logger.info("Converting {}", entry.id());
-
-                    String domain = entry.id();
-                    String id = Integer.toHexString(domain.hashCode());
-
-                    Path outputFile = CrawlerOutputFile.createSlopPath(root, id, domain);
-
-                    SlopCrawlDataRecord.convertFromParquet(path, outputFile);
-
-                    workLog.setJobToFinished(entry.id(), outputFile.toString(), entry.cnt());
-                }
-                else {
-                    workLog.setJobToFinished(entry.id(), path.toString(), entry.cnt());
-                }
-
-                migrationHeartbeat.progress("Parquet To Slop", ++finishedTasks, totalTasks);
-            }
-        }
-
-        Path oldCrawlerLog = Files.createTempFile(root, "crawler-", ".migrate.old.log");
-        Files.move(crawlerLog, oldCrawlerLog, StandardCopyOption.REPLACE_EXISTING);
-        Files.move(newCrawlerLog, crawlerLog);
-    }
-
-
-    private static class CrawlDataLocator implements Function<WorkLogEntry, Optional<Map.Entry<WorkLogEntry, Path>>> {
-
-        private final Path crawlRootDir;
-
-        CrawlDataLocator(Path crawlRootDir) {
-            this.crawlRootDir = crawlRootDir;
-        }
-
-        @Override
-        public Optional<Map.Entry<WorkLogEntry, Path>> apply(WorkLogEntry entry) {
-            var path = getCrawledFilePath(crawlRootDir, entry.path());
-
-            if (!Files.exists(path)) {
-                return Optional.empty();
-            }
-
-            try {
-                return Optional.of(Map.entry(entry, path));
-            }
-            catch (Exception ex) {
-                return Optional.empty();
-            }
-        }
-
-        private Path getCrawledFilePath(Path crawlDir, String fileName) {
-            int sp = fileName.lastIndexOf('/');
-
-            // Normalize the filename
-            if (sp >= 0 && sp + 1< fileName.length())
-                fileName = fileName.substring(sp + 1);
-            if (fileName.length() < 4)
-                fileName = Strings.repeat("0", 4 - fileName.length()) + fileName;
-
-            String sp1 = fileName.substring(0, 2);
-            String sp2 = fileName.substring(2, 4);
-            return crawlDir.resolve(sp1).resolve(sp2).resolve(fileName);
-        }
+        return outputFile;
    }

 }
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/DomainStateDb.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/DomainStateDb.java
@@ -9,6 +9,7 @@ import java.sql.Connection;
 import java.sql.DriverManager;
 import java.sql.SQLException;
 import java.time.Instant;
+import java.util.Objects;
 import java.util.Optional;

 /** Supplemental sqlite database for storing the summary of a crawl.
@@ -60,6 +61,8 @@ public class DomainStateDb implements AutoCloseable {

    }

+    public record FaviconRecord(String contentType, byte[] imageData) {}
+
    public DomainStateDb(Path filename) throws SQLException {
        String sqliteDbString = "jdbc:sqlite:" + filename.toString();
        connection = DriverManager.getConnection(sqliteDbString);
@@ -74,7 +77,13 @@ public class DomainStateDb implements AutoCloseable {
                        feedUrl TEXT
                    )
                    """);
-
+            stmt.executeUpdate("""
+                    CREATE TABLE IF NOT EXISTS favicon (
+                        domain TEXT PRIMARY KEY,
+                        contentType TEXT NOT NULL,
+                        icon BLOB NOT NULL
+                    )
+                    """);
            stmt.execute("PRAGMA journal_mode=WAL");
        }
    }
@@ -85,6 +94,41 @@ public class DomainStateDb implements AutoCloseable {
    }


+    public void saveIcon(String domain, FaviconRecord faviconRecord) {
+        try (var stmt = connection.prepareStatement("""
+                INSERT OR REPLACE INTO favicon (domain, contentType, icon)
+                       VALUES(?, ?, ?)
+            """)) {
+            stmt.setString(1, domain);
+            stmt.setString(2, Objects.requireNonNullElse(faviconRecord.contentType, "application/octet-stream"));
+            stmt.setBytes(3, faviconRecord.imageData);
+            stmt.executeUpdate();
+        }
+        catch (SQLException ex) {
+            logger.error("Failed to insert favicon", ex);
+        }
+    }
+
+    public Optional<FaviconRecord> getIcon(String domain) {
+        try (var stmt = connection.prepareStatement("SELECT contentType, icon FROM favicon WHERE DOMAIN = ?")) {
+            stmt.setString(1, domain);
+            var rs = stmt.executeQuery();
+
+            if (rs.next()) {
+                return Optional.of(
+                    new FaviconRecord(
+                        rs.getString("contentType"),
+                        rs.getBytes("icon")
+                    )
+                );
+            }
+        } catch (SQLException e) {
+            logger.error("Failed to retrieve favicon", e);
+        }
+
+        return Optional.empty();
+    }
+
    public void save(SummaryRecord record) {
        try (var stmt = connection.prepareStatement("""
                INSERT OR REPLACE INTO summary (domain, lastUpdatedEpochMs, state, stateDesc, feedUrl)
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java
@@ -23,12 +23,10 @@ import org.slf4j.LoggerFactory;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URISyntaxException;
-import java.net.URLDecoder;
 import java.net.http.HttpClient;
 import java.net.http.HttpRequest;
 import java.net.http.HttpResponse;
 import java.net.http.HttpTimeoutException;
-import java.nio.charset.StandardCharsets;
 import java.time.Duration;
 import java.util.*;
 import java.util.concurrent.Executors;
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java
@@ -96,7 +96,7 @@ public class WarcRecorder implements AutoCloseable {
        try {
            response = client.send(request, java.net.http.HttpResponse.BodyHandlers.ofInputStream());
        }
-        catch (IOException ex) {
+        catch (Exception ex) {
            logger.warn("Failed to fetch URL {}:  {}", requestUri, ex.getMessage());
            return new HttpFetchResult.ResultException(ex);
        }
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -19,7 +19,6 @@ import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.DocumentBodyExtractor;
 import nu.marginalia.model.body.HttpFetchResult;
 import nu.marginalia.model.crawldata.CrawlerDomainStatus;
-import org.jsoup.Jsoup;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -273,7 +272,16 @@ public class CrawlerRetreiver implements AutoCloseable {
            feedLink.ifPresent(s -> fetcher.fetchSitemapUrls(s, timer));

            // Grab the favicon if it exists
-            fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
+
+            if (fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty()) instanceof HttpFetchResult.ResultOk iconResult) {
+                String contentType = iconResult.header("Content-Type");
+                byte[] iconData = iconResult.getBodyBytes();
+
+                domainStateDb.saveIcon(
+                        domain,
+                        new DomainStateDb.FaviconRecord(contentType, iconData)
+                );
+            }
            timer.waitFetchDelay(0);

        }
--- a/code/processes/crawling-process/model/java/nu/marginalia/io/CrawledDomainReader.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/CrawledDomainReader.java
@@ -5,9 +5,7 @@ import nu.marginalia.io.crawldata.format.SlopSerializableCrawlDataStream;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.io.FileNotFoundException;
 import java.io.IOException;
-import java.nio.file.Files;
 import java.nio.file.Path;

 public class CrawledDomainReader {
@@ -26,7 +24,8 @@ public class CrawledDomainReader {
                return SerializableCrawlDataStream.empty();
            }
        }
-        else if (fileName.endsWith(".slop.zip")) {
+
+        if (fileName.endsWith(".slop.zip")) {
            try {
                return new SlopSerializableCrawlDataStream(fullPath);
            } catch (Exception ex) {
@@ -34,22 +33,21 @@ public class CrawledDomainReader {
                return SerializableCrawlDataStream.empty();
            }
        }
-        else {
+
        logger.error("Unknown file type: {}", fullPath);
        return SerializableCrawlDataStream.empty();
    }
+
+    public static int sizeHint(Path fullPath) {
+        String fileName = fullPath.getFileName().toString();
+        if (fileName.endsWith(".parquet")) {
+            return ParquetSerializableCrawlDataStream.sizeHint(fullPath);
        }
-
-    /** An iterator-like access to domain data. This must be closed otherwise it will leak off-heap memory! */
-    public static SerializableCrawlDataStream createDataStream(Path basePath, String domain, String id) throws IOException {
-        Path parquetPath = CrawlerOutputFile.getParquetPath(basePath, id, domain);
-
-        if (Files.exists(parquetPath)) {
-            return createDataStream(parquetPath);
+        else if (fileName.endsWith(".slop.zip")) {
+            return SlopSerializableCrawlDataStream.sizeHint(fullPath);
        }
        else {
-            throw new FileNotFoundException("No such file: " + parquetPath);
+            return 0;
        }
    }
-
 }
--- a/code/processes/crawling-process/model/java/nu/marginalia/io/CrawlerOutputFile.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/CrawlerOutputFile.java
@@ -35,19 +35,6 @@ public class CrawlerOutputFile {
        return destDir.resolve(id + "-" + filesystemSafeName(domain) + "-" + version.suffix + ".warc.gz");
    }

-    public static Path createParquetPath(Path basePath, String id, String domain) throws IOException {
-        id = padId(id);
-
-        String first = id.substring(0, 2);
-        String second = id.substring(2, 4);
-
-        Path destDir = basePath.resolve(first).resolve(second);
-        if (!Files.exists(destDir)) {
-            Files.createDirectories(destDir);
-        }
-        return destDir.resolve(id + "-" + filesystemSafeName(domain) + ".parquet");
-    }
-
    public static Path createSlopPath(Path basePath, String id, String domain) throws IOException {
        id = padId(id);

@@ -71,16 +58,17 @@ public class CrawlerOutputFile {
        return destDir.resolve(id + "-" + filesystemSafeName(domain) + ".parquet");
    }

-    public static Path getWarcPath(Path basePath, String id, String domain, WarcFileVersion version) {
+    public static Path getSlopPath(Path basePath, String id, String domain) {
        id = padId(id);

        String first = id.substring(0, 2);
        String second = id.substring(2, 4);

        Path destDir = basePath.resolve(first).resolve(second);
-        return destDir.resolve(id + "-" + filesystemSafeName(domain) + ".warc" + version.suffix);
+        return destDir.resolve(id + "-" + filesystemSafeName(domain) + ".slop.zip");
    }

+
    /**
     * Pads the given ID with leading zeros to ensure it has a length of 4 characters.
     */
--- a/code/processes/crawling-process/model/java/nu/marginalia/io/SerializableCrawlDataStream.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/SerializableCrawlDataStream.java
@@ -34,6 +34,8 @@ public interface SerializableCrawlDataStream extends AutoCloseable {
    @Nullable
    default Path path() { return null; }

+    void close() throws IOException;
+
    default <T>  Iterator<T> map(Function<SerializableCrawlData, Optional<T>> mapper) {
        return new Iterator<>() {
            T next = null;
--- a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java
@@ -40,7 +40,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
        return path;
    }

-    public int sizeHint() {
+    public static int sizeHint(Path path) {
        // Only calculate size hint for large files
        // (the reason we calculate them in the first place is to assess whether it is large
        // because it has many documents, or because it is a small number of large documents)
--- a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/SlopSerializableCrawlDataStream.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/SlopSerializableCrawlDataStream.java
@@ -52,7 +52,7 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
        return path;
    }

-    public int sizeHint() {
+    public static int sizeHint(Path path) {
        // Only calculate size hint for large files
        // (the reason we calculate them in the first place is to assess whether it is large
        // because it has many documents, or because it is a small number of large documents)
--- a/code/processes/crawling-process/model/java/nu/marginalia/model/body/HttpFetchResult.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/body/HttpFetchResult.java
@@ -12,6 +12,7 @@ import java.io.InputStream;
 import java.net.InetAddress;
 import java.net.URI;
 import java.net.http.HttpHeaders;
+import java.util.Arrays;
 import java.util.Optional;

 /* FIXME:  This interface has a very unfortunate name that is not very descriptive.
@@ -58,7 +59,7 @@ public sealed interface HttpFetchResult {
                    int statusCode,
                    HttpHeaders headers,
                    String ipAddress,
-                    byte[] bytesRaw,
+                    byte[] bytesRaw, // raw data for the entire response including headers
                    int bytesStart,
                    int bytesLength
    ) implements HttpFetchResult {
@@ -75,6 +76,12 @@ public sealed interface HttpFetchResult {
            return new ByteArrayInputStream(bytesRaw, bytesStart, bytesLength);
        }

+        /** Copy the byte range corresponding to the payload of the response.
+            Warning: this copies the data; use getInputStream() for zero-copy access */
+        public byte[] getBodyBytes() {
+            return Arrays.copyOfRange(bytesRaw, bytesStart, bytesStart + bytesLength);
+        }
+
        public Optional<Document> parseDocument() {
            return DocumentBodyExtractor.asString(this).flatMapOpt((contentType, body) -> {
                if (contentType.is("text/html")) {
--- a/code/processes/crawling-process/test/nu/marginalia/crawl/DomainStateDbTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/DomainStateDbTest.java
@@ -10,7 +10,7 @@ import java.nio.file.Path;
 import java.sql.SQLException;
 import java.time.Instant;

-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.*;

 class DomainStateDbTest {

@@ -26,7 +26,7 @@ class DomainStateDbTest {
    }

    @Test
-    public void testSunnyDay() throws SQLException {
+    public void testSummaryRecord() throws SQLException {
        try (var db = new DomainStateDb(tempFile)) {
            var allFields = new DomainStateDb.SummaryRecord(
                    "all.marginalia.nu",
@@ -63,4 +63,21 @@ class DomainStateDbTest {
        }
    }

+    @Test
+    public void testFavicon() throws SQLException {
+        try (var db = new DomainStateDb(tempFile)) {
+            db.saveIcon("www.marginalia.nu", new DomainStateDb.FaviconRecord("text/plain", "hello world".getBytes()));
+
+            var maybeData = db.getIcon("www.marginalia.nu");
+            assertTrue(maybeData.isPresent());
+            var actualData = maybeData.get();
+
+            assertEquals("text/plain", actualData.contentType());
+            assertArrayEquals("hello world".getBytes(), actualData.imageData());
+
+            maybeData = db.getIcon("foobar");
+            assertTrue(maybeData.isEmpty());
+        }
+    }
+
 }
--- a/code/services-application/api-service/build.gradle
+++ b/code/services-application/api-service/build.gradle
@@ -3,7 +3,7 @@ plugins {

    id 'application'
    id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }

 java {
--- a/code/services-application/dating-service/build.gradle
+++ b/code/services-application/dating-service/build.gradle
@@ -3,7 +3,7 @@ plugins {

    id 'application'
    id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }

 application {
--- a/code/services-application/explorer-service/build.gradle
+++ b/code/services-application/explorer-service/build.gradle
@@ -3,7 +3,7 @@ plugins {

    id 'application'
    id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }

 application {
--- a/code/services-application/search-service-legacy/build.gradle
+++ b/code/services-application/search-service-legacy/build.gradle
@@ -5,7 +5,7 @@ plugins {
    id 'application'
    id 'jvm-test-suite'

-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }

 application {
--- a/code/services-application/search-service/build.gradle
+++ b/code/services-application/search-service/build.gradle
@@ -3,7 +3,7 @@ plugins {
    id 'application'
    id 'jvm-test-suite'
    id 'gg.jte.gradle' version '3.1.15'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }

 application {
@@ -104,6 +104,8 @@ task compileTailwind {

    doLast {
        exec {
+            // If you're getting a build error like 'npm error could not determine executable to run'
+            // pointing you here, you need to run  `npm install -D tailwindcss`
            workingDir projectDir
            if (System.getProperty('os.name').toLowerCase().contains('windows')) {
                commandLine 'cmd', '/c', 'npx', 'tailwindcss',
--- a/code/services-application/search-service/java/nu/marginalia/search/svc/SearchSiteInfoService.java
+++ b/code/services-application/search-service/java/nu/marginalia/search/svc/SearchSiteInfoService.java
@@ -140,7 +140,8 @@ public class SearchSiteInfoService {
    ) throws SQLException, ExecutionException {

        if (null == domainName || domainName.isBlank()) {
-            return null;
+            // If we don't get a domain name, we redirect to the /site endpoint
+            return new MapModelAndView("redirect.jte", Map.of("url", "/site"));
        }

        page = Objects.requireNonNullElse(page, 1);
--- a/code/services-application/search-service/resources/jte/serp/part/result.jte
+++ b/code/services-application/search-service/resources/jte/serp/part/result.jte
@@ -86,7 +86,7 @@
            @endif

            @if(result.getFirst().isTracking())
-                <span class="px-1 bg-yellow-100 text-yellow-700 dark:border dark:border-yellow-600 dark:text-yellow-400 dark:bg-black rounded" title="Uses tracking scripts">Track</span>
+                <span class="px-1 bg-yellow-100 text-yellow-700 dark:border dark:border-yellow-600 dark:text-yellow-400 dark:bg-black rounded" title="Uses tracking scripts">Tracking</span>
            @endif

            @if(result.getFirst().isScripts())
@@ -94,11 +94,11 @@
            @endif

            @if(result.getFirst().isAds())
-                <span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains adtech">Ads</span>
+                <span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains adtech">Has Ads</span>
            @endif

            @if(result.getFirst().isAffiliate())
-                <span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains Affiliate Link">Affiliate</span>
+                <span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains Affiliate Link">Has Affiliate</span>
            @endif

        </span>
--- a/code/services-application/search-service/resources/jte/siteinfo/view/docs.jte
+++ b/code/services-application/search-service/resources/jte/siteinfo/view/docs.jte
@@ -53,7 +53,7 @@
        @endif

        @if(details.isTracking())
-            <span class="px-1 bg-yellow-100 text-yellow-700 dark:border dark:border-yellow-600 dark:text-yellow-400 dark:bg-black rounded" title="Uses tracking scripts">Track</span>
+            <span class="px-1 bg-yellow-100 text-yellow-700 dark:border dark:border-yellow-600 dark:text-yellow-400 dark:bg-black rounded" title="Uses tracking scripts">Tracking</span>
        @endif

        @if(details.isScripts())
@@ -65,7 +65,7 @@
        @endif

        @if(details.isAffiliate())
-            <span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains Affiliate Link">Affiliate</span>
+            <span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains Affiliate Link">Has Affiliate</span>
        @endif

    </div>
--- a/code/services-application/status-service/build.gradle
+++ b/code/services-application/status-service/build.gradle
@@ -2,7 +2,7 @@ plugins {
    id 'java'
    id 'application'
    id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }

 java {
--- a/code/services-core/assistant-service/build.gradle
+++ b/code/services-core/assistant-service/build.gradle
@@ -3,7 +3,7 @@ plugins {

    id 'application'
    id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }

 application {
--- a/code/services-core/control-service/build.gradle
+++ b/code/services-core/control-service/build.gradle
@@ -2,7 +2,7 @@ plugins {
    id 'java'
    id 'application'
    id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }

 java {
--- a/code/services-core/executor-service/build.gradle
+++ b/code/services-core/executor-service/build.gradle
@@ -3,7 +3,7 @@ plugins {

    id 'application'
    id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }

 application {
--- a/code/services-core/index-service/build.gradle
+++ b/code/services-core/index-service/build.gradle
@@ -3,7 +3,7 @@ plugins {

    id 'application'
    id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }

 application {
--- a/code/services-core/query-service/build.gradle
+++ b/code/services-core/query-service/build.gradle
@@ -3,7 +3,7 @@ plugins {

    id 'application'
    id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }

 application {
--- a/code/tools/screenshot-capture-tool/build.gradle
+++ b/code/tools/screenshot-capture-tool/build.gradle
@@ -3,7 +3,7 @@ plugins {

    id 'application'
    id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }

 java {
--- a/run/readme.md
+++ b/run/readme.md
@@ -16,7 +16,7 @@ platforms, but for lack of suitable hardware, this can not be guaranteed.
 The civilized way of installing this is to use [SDKMAN](https://sdkman.io/);
 graalce is a good distribution choice but it doesn't matter too much.

-**Tailwindcss** - Install NPM and run `npm install -D tailwindcss`
+**Tailwindcss** - Install NPM and run `npm install tailwindcss @tailwindcss/cli`

 ## Quick Set up
Author	SHA1	Message	Date
Viktor Lofgren	503ea57d5b	(converter) Reduce lock contention in converter by separating the processing of full and simple-track domains	2025-01-26 13:18:14 +01:00
Viktor Lofgren	18ca926c7f	(converter) Truncate excessively long strings in SentenceExtractor, malformed data was effectively DOS:ing the converter	2025-01-26 12:52:54 +01:00
Viktor Lofgren	db99242db2	(converter) Adding some logging around the simple processing track to investigate an issue with the converter stalling	2025-01-26 12:02:00 +01:00
Viktor Lofgren	2b9d2985ba	(doc) Update readme with up-to-date install instructions.	2025-01-24 18:51:41 +01:00
Viktor Lofgren	eeb6ecd711	(search) Make it clearer that the affiliate marker applies to the result, and not the search engine's relation to the result.	2025-01-24 18:50:00 +01:00
Viktor Lofgren	1f58aeadbf	(build) Upgrade JIB	2025-01-24 18:49:28 +01:00
Viktor Lofgren	3d68be64da	(crawler) Add default CT when it's missing for icons	2025-01-22 13:55:47 +01:00
Viktor Lofgren	668f3b16ef	(search) Redirect ^/site/$ to /site	2025-01-22 13:35:18 +01:00
Viktor Lofgren	98a340a0d1	(crawler) Add favicon data to domain state db in its own table	2025-01-22 11:41:20 +01:00
Viktor Lofgren	8862100f7e	(crawler) Improve logging and error handling	2025-01-21 21:44:21 +01:00
Viktor Lofgren	274941f6de	(crawler) Smarter parquet->slop crawl data migration	2025-01-21 21:26:12 +01:00