Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-10-06 07:32:38 +02:00)

Compare commits: deploy-007...deploy-008 (20 commits)
Commits (SHA1):

fbba392491
530eb35949
c2dd2175a2
b8581b0f56
2ea34767d8
e9af838231
ae0cad47c4
5fbc8ef998
32c6dd9e6a
6ece6a6cfb
39cd1c18f8
eb65daaa88
0bebdb6e33
1e50e392c6
fb673de370
eee73ab16c
5354e034bf
72384ad6ca
a2b076f9be
c8b0a32c0f
@@ -24,58 +24,4 @@ public class LanguageModels {
         this.fasttextLanguageModel = fasttextLanguageModel;
         this.segments = segments;
     }
-
-    public static LanguageModelsBuilder builder() {
-        return new LanguageModelsBuilder();
-    }
-
-    public static class LanguageModelsBuilder {
-        private Path termFrequencies;
-        private Path openNLPSentenceDetectionData;
-        private Path posRules;
-        private Path posDict;
-        private Path fasttextLanguageModel;
-        private Path segments;
-
-        LanguageModelsBuilder() {
-        }
-
-        public LanguageModelsBuilder termFrequencies(Path termFrequencies) {
-            this.termFrequencies = termFrequencies;
-            return this;
-        }
-
-        public LanguageModelsBuilder openNLPSentenceDetectionData(Path openNLPSentenceDetectionData) {
-            this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
-            return this;
-        }
-
-        public LanguageModelsBuilder posRules(Path posRules) {
-            this.posRules = posRules;
-            return this;
-        }
-
-        public LanguageModelsBuilder posDict(Path posDict) {
-            this.posDict = posDict;
-            return this;
-        }
-
-        public LanguageModelsBuilder fasttextLanguageModel(Path fasttextLanguageModel) {
-            this.fasttextLanguageModel = fasttextLanguageModel;
-            return this;
-        }
-
-        public LanguageModelsBuilder segments(Path segments) {
-            this.segments = segments;
-            return this;
-        }
-
-        public LanguageModels build() {
-            return new LanguageModels(this.termFrequencies, this.openNLPSentenceDetectionData, this.posRules, this.posDict, this.fasttextLanguageModel, this.segments);
-        }
-
-        public String toString() {
-            return "LanguageModels.LanguageModelsBuilder(termFrequencies=" + this.termFrequencies + ", openNLPSentenceDetectionData=" + this.openNLPSentenceDetectionData + ", posRules=" + this.posRules + ", posDict=" + this.posDict + ", fasttextLanguageModel=" + this.fasttextLanguageModel + ", segments=" + this.segments + ")";
-        }
-    }
 }
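This hunk deletes the hand-written builder, leaving direct construction as the only way to create a LanguageModels. A hypothetical call-site sketch, using the six-argument order visible in the deleted build() method; apart from the two model files named elsewhere in this changeset, the paths are made up for illustration:

import java.nio.file.Path;

// Hypothetical call site; argument order follows the deleted build() method.
LanguageModels models = new LanguageModels(
        Path.of("model/tfreq-new-algo3.bin"),  // termFrequencies
        Path.of("model/opennlp-sentence.bin"), // openNLPSentenceDetectionData (illustrative path)
        Path.of("model/pos-rules.txt"),        // posRules (illustrative path)
        Path.of("model/pos-dict.txt"),         // posDict (illustrative path)
        Path.of("model/lid.176.ftz"),          // fasttextLanguageModel
        Path.of("model/segments.bin")          // segments (illustrative path)
);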
@@ -10,7 +10,9 @@ import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.time.LocalDateTime;
-import java.util.*;
+import java.util.HashSet;
+import java.util.Optional;
+import java.util.Set;
 import java.util.function.Function;
 
 /** WorkLog is a journal of work done by a process,
@@ -61,6 +63,12 @@ public class WorkLog implements AutoCloseable, Closeable {
         return new WorkLoadIterable<>(logFile, mapper);
     }
 
+    public static int countEntries(Path crawlerLog) throws IOException {
+        try (var linesStream = Files.lines(crawlerLog)) {
+            return (int) linesStream.filter(WorkLogEntry::isJobId).count();
+        }
+    }
+
     // Use synchro over concurrent set to avoid competing writes
     // - correct is better than fast here, it's sketchy enough to use
     // a PrintWriter
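The new helper makes one cheap pass over the log so that callers can report progress against a known total; the migration actor later in this changeset uses it exactly like:

// From the MigrateCrawlDataActor hunk below:
int totalEntries = WorkLog.countEntries(crawlerLog);
// ... then, per processed entry:
// heartbeat.progress(taskName, entryIdx++, totalEntries);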
@@ -14,6 +14,8 @@ import nu.marginalia.mq.persistence.MqPersistence;
 import nu.marginalia.nodecfg.NodeConfigurationService;
 import nu.marginalia.nodecfg.model.NodeProfile;
 import nu.marginalia.service.module.ServiceConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.time.Duration;
 import java.time.LocalDateTime;
@@ -29,6 +31,7 @@ public class UpdateRssActor extends RecordActorPrototype {
 
     private final NodeConfigurationService nodeConfigurationService;
     private final MqPersistence persistence;
+    private static final Logger logger = LoggerFactory.getLogger(UpdateRssActor.class);
 
     @Inject
     public UpdateRssActor(Gson gson,
@@ -101,8 +104,8 @@ public class UpdateRssActor extends RecordActorPrototype {
             case UpdateRefresh(int count, long msgId) -> {
                 MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
                 if (msg == null) {
-                    // Retry the update
-                    yield new Error("Failed to update feeds: message not found");
+                    logger.warn("UpdateRefresh is taking a very long time");
+                    yield new UpdateRefresh(count, msgId);
                 } else if (msg.state() != MqMessageState.OK) {
                     // Retry the update
                     yield new Error("Failed to update feeds: " + msg.state());
@@ -119,8 +122,8 @@ public class UpdateRssActor extends RecordActorPrototype {
             case UpdateClean(long msgId) -> {
                 MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
                 if (msg == null) {
-                    // Retry the update
-                    yield new Error("Failed to update feeds: message not found");
+                    logger.warn("UpdateClean is taking a very long time");
+                    yield new UpdateClean(msgId);
                 } else if (msg.state() != MqMessageState.OK) {
                     // Retry the update
                     yield new Error("Failed to update feeds: " + msg.state());
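Both hunks replace a hard failure with a retry: when waitForMessageTerminalState returns null (no terminal state within the wait window), the actor now logs a warning and yields its own current step again, looping back to keep waiting; an Error is only yielded once the message reaches a terminal state other than OK. A minimal, self-contained sketch of the pattern with hypothetical names, outside the real actor framework:

// Hypothetical illustration of retry-by-re-yielding: on timeout, the state
// machine re-enters the same state instead of failing outright.
public class RetryPatternSketch {
    sealed interface Step permits Wait, Done, Failed {}
    record Wait(long msgId) implements Step {}
    record Done() implements Step {}
    record Failed(String reason) implements Step {}

    record Message(boolean ok, String state) {}

    // msg == null models the wait call timing out without a terminal state
    static Step next(Step current, Message msg) {
        return switch (current) {
            case Wait(long msgId) -> {
                if (msg == null) yield new Wait(msgId);            // keep waiting
                else if (!msg.ok()) yield new Failed(msg.state()); // terminal, not OK
                else yield new Done();
            }
            case Done d -> d;
            case Failed f -> f;
        };
    }
}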
@@ -8,6 +8,7 @@ import nu.marginalia.actor.state.ActorStep;
 import nu.marginalia.io.CrawlerOutputFile;
 import nu.marginalia.process.log.WorkLog;
 import nu.marginalia.process.log.WorkLogEntry;
+import nu.marginalia.service.control.ServiceHeartbeat;
 import nu.marginalia.slop.SlopCrawlDataRecord;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorage;
@@ -18,6 +19,7 @@ import org.slf4j.LoggerFactory;
 
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
 import java.util.Map;
 import java.util.Optional;
 import java.util.function.Function;
@@ -26,14 +28,15 @@ import java.util.function.Function;
 public class MigrateCrawlDataActor extends RecordActorPrototype {
 
     private final FileStorageService fileStorageService;
+    private final ServiceHeartbeat serviceHeartbeat;
     private static final Logger logger = LoggerFactory.getLogger(MigrateCrawlDataActor.class);
 
     @Inject
-    public MigrateCrawlDataActor(Gson gson, FileStorageService fileStorageService) {
+    public MigrateCrawlDataActor(Gson gson, FileStorageService fileStorageService, ServiceHeartbeat serviceHeartbeat) {
         super(gson);
 
         this.fileStorageService = fileStorageService;
+        this.serviceHeartbeat = serviceHeartbeat;
     }
 
     public record Run(long fileStorageId) implements ActorStep {}
@@ -49,33 +52,50 @@ public class MigrateCrawlDataActor extends RecordActorPrototype {
                 Path crawlerLog = root.resolve("crawler.log");
                 Path newCrawlerLog = Files.createTempFile(root, "crawler", ".migrate.log");
 
-                try (WorkLog workLog = new WorkLog(newCrawlerLog)) {
+                int totalEntries = WorkLog.countEntries(crawlerLog);
+
+                try (WorkLog workLog = new WorkLog(newCrawlerLog);
+                     var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Migrating")
+                ) {
+                    int entryIdx = 0;
 
                     for (Map.Entry<WorkLogEntry, Path> item : WorkLog.iterableMap(crawlerLog, new CrawlDataLocator(root))) {
 
-                        var entry = item.getKey();
-                        var path = item.getValue();
+                        final WorkLogEntry entry = item.getKey();
+                        final Path inputPath = item.getValue();
 
-                        logger.info("Converting {}", entry.id());
+                        Path outputPath = inputPath;
+                        heartbeat.progress("Migrating" + inputPath.getFileName(), entryIdx++, totalEntries);
 
-                        if (path.toFile().getName().endsWith(".parquet")) {
+                        if (inputPath.toString().endsWith(".parquet")) {
                             String domain = entry.id();
                             String id = Integer.toHexString(domain.hashCode());
 
-                            Path outputFile = CrawlerOutputFile.createSlopPath(root, id, domain);
+                            outputPath = CrawlerOutputFile.createSlopPath(root, id, domain);
 
-                            SlopCrawlDataRecord.convertFromParquet(path, outputFile);
+                            if (Files.exists(inputPath)) {
+                                try {
+                                    SlopCrawlDataRecord.convertFromParquet(inputPath, outputPath);
+                                    Files.deleteIfExists(inputPath);
+                                } catch (Exception ex) {
+                                    outputPath = inputPath; // don't update the work log on error
+                                    logger.error("Failed to convert " + inputPath, ex);
+                                }
+                            }
+                            else if (!Files.exists(inputPath) && !Files.exists(outputPath)) {
+                                // if the input file is missing, and the output file is missing, we just write the log
+                                // record identical to the old one
+                                outputPath = inputPath;
+                            }
+                        }
 
-                            workLog.setJobToFinished(entry.id(), outputFile.toString(), entry.cnt());
-                        }
-                        else {
-                            workLog.setJobToFinished(entry.id(), path.toString(), entry.cnt());
-                        }
+                        // Write a log entry for the (possibly) converted file
+                        workLog.setJobToFinished(entry.id(), outputPath.toString(), entry.cnt());
                     }
                 }
 
                 Path oldCrawlerLog = Files.createTempFile(root, "crawler-", ".migrate.old.log");
-                Files.move(crawlerLog, oldCrawlerLog);
+                Files.move(crawlerLog, oldCrawlerLog, StandardCopyOption.REPLACE_EXISTING);
                 Files.move(newCrawlerLog, crawlerLog);
 
                 yield new End();
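The migration now counts entries up front so it can report progress as step/total through an ad-hoc task heartbeat, instead of logging one line per domain. A generic sketch of the shape of that pattern, with a hypothetical ProgressHeartbeat standing in for the project's ServiceHeartbeat API:

import java.nio.file.Path;
import java.util.List;

// Hypothetical stand-in for the project's ad-hoc task heartbeat API.
interface ProgressHeartbeat extends AutoCloseable {
    void progress(String taskName, int step, int totalSteps);
    @Override void close();
}

class MigrationProgressSketch {
    static void migrateAll(List<Path> inputs, ProgressHeartbeat heartbeat) {
        int total = inputs.size();  // known up front, like WorkLog.countEntries()
        int idx = 0;
        try (heartbeat) {
            for (Path input : inputs) {
                heartbeat.progress("Migrating " + input.getFileName(), idx++, total);
                // ... convert input here ...
            }
        }
    }
}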
@@ -34,6 +34,7 @@ dependencies {
     implementation libs.bundles.slf4j
     implementation libs.commons.lang3
     implementation libs.commons.io
+    implementation libs.wiremock
 
     implementation libs.prometheus
     implementation libs.guava
@@ -1,6 +1,7 @@
 package nu.marginalia.livecapture;
 
 import com.google.gson.Gson;
+import nu.marginalia.WmsaHome;
 import nu.marginalia.model.gson.GsonFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -27,6 +28,8 @@ public class BrowserlessClient implements AutoCloseable {
     private final URI browserlessURI;
     private final Gson gson = GsonFactory.get();
 
+    private final String userAgent = WmsaHome.getUserAgent().uaString();
+
     public BrowserlessClient(URI browserlessURI) {
         this.browserlessURI = browserlessURI;
     }
@@ -34,6 +37,7 @@ public class BrowserlessClient implements AutoCloseable {
     public String content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
         Map<String, Object> requestData = Map.of(
                 "url", url,
+                "userAgent", userAgent,
                 "gotoOptions", gotoOptions
         );
 
@@ -60,6 +64,7 @@ public class BrowserlessClient implements AutoCloseable {
 
         Map<String, Object> requestData = Map.of(
                 "url", url,
+                "userAgent", userAgent,
                 "options", screenshotOptions,
                 "gotoOptions", gotoOptions
         );
@@ -1,5 +1,8 @@
 package nu.marginalia.livecapture;
 
+import com.github.tomakehurst.wiremock.WireMockServer;
+import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
+import nu.marginalia.WmsaHome;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Tag;
@@ -8,19 +11,74 @@ import org.testcontainers.containers.GenericContainer;
 import org.testcontainers.junit.jupiter.Testcontainers;
 import org.testcontainers.utility.DockerImageName;
 
+import java.io.IOException;
+import java.net.SocketException;
 import java.net.URI;
 import java.util.Map;
+
+import static com.github.tomakehurst.wiremock.client.WireMock.*;
+import static java.net.NetworkInterface.getNetworkInterfaces;
 
 @Testcontainers
 @Tag("slow")
 public class BrowserlessClientTest {
     static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
             .withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
+            .withNetworkMode("bridge")
             .withExposedPorts(3000);
+
+    static WireMockServer wireMockServer = new WireMockServer(WireMockConfiguration.wireMockConfig().port(18089));
+
+    static String localIp;
 
     @BeforeAll
-    public static void setup() {
+    public static void setup() throws IOException {
         container.start();
+
+        wireMockServer.start();
+        wireMockServer.stubFor(get("/").willReturn(aResponse().withStatus(200).withBody("Ok")));
+
+        localIp = findLocalIp();
+    }
+
+    private static String findLocalIp() throws SocketException {
+        var interfaces = getNetworkInterfaces();
+
+        while (interfaces.hasMoreElements()) {
+            var iface = interfaces.nextElement();
+
+            if (iface.isLoopback())
+                continue;
+            else if (iface.isVirtual())
+                continue;
+
+            var addresses = iface.getInetAddresses();
+
+            while (addresses.hasMoreElements()) {
+                var address = addresses.nextElement();
+
+                if (!address.isSiteLocalAddress()) continue;
+
+                return address.getHostAddress();
+            }
+        }
+
+        return "127.0.0.1";
+    }
+
+    @Tag("flaky")
+    @Test
+    public void testInspectContentUA__Flaky() throws Exception {
+        try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) {
+            client.content("http://" + localIp + ":18089/", BrowserlessClient.GotoOptions.defaultValues());
+        }
+
+        wireMockServer.verify(getRequestedFor(urlEqualTo("/")).withHeader("User-Agent", equalTo(WmsaHome.getUserAgent().uaString())));
+    }
+
+    @Tag("flaky")
+    @Test
+    public void testInspectScreenshotUA__Flaky() throws Exception {
+        try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) {
+            client.screenshot("http://" + localIp + ":18089/", BrowserlessClient.GotoOptions.defaultValues(), BrowserlessClient.ScreenshotOptions.defaultValues());
+        }
+
+        wireMockServer.verify(getRequestedFor(urlEqualTo("/")).withHeader("User-Agent", equalTo(WmsaHome.getUserAgent().uaString())));
     }
 
     @Test
@@ -134,6 +134,10 @@ public class QueryExpansion {
             if (scoreCombo > scoreA + scoreB || scoreCombo > 1000) {
                 graph.addVariantForSpan(prev, qw, joinedWord);
             }
+            else if (StringUtils.isAlpha(prev.word()) && StringUtils.isNumeric(qw.word())) { // join e.g. trs 80 to trs80 and trs-80
+                graph.addVariantForSpan(prev, qw, prev.word() + qw.word());
+                graph.addVariantForSpan(prev, qw, prev.word() + "-" + qw.word());
+            }
         }
 
         prev = qw;
@@ -213,6 +213,18 @@ public class QueryFactoryTest {
         System.out.println(subquery);
     }
 
+    @Test
+    public void testContractionWordNum() {
+        var subquery = parseAndGetSpecs("glove 80");
+
+        Assertions.assertTrue(subquery.query.compiledQuery.contains(" glove "));
+        Assertions.assertTrue(subquery.query.compiledQuery.contains(" 80 "));
+        Assertions.assertTrue(subquery.query.compiledQuery.contains(" glove-80 "));
+        Assertions.assertTrue(subquery.query.compiledQuery.contains(" glove80 "));
+    }
+
+
     @Test
     public void testCplusPlus() {
         var subquery = parseAndGetSpecs("std::vector::push_back vector");
@@ -5,9 +5,7 @@ import nu.marginalia.actor.state.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
+import java.util.*;
 
 public abstract class RecordActorPrototype implements ActorPrototype {
 
@@ -118,7 +116,7 @@ public abstract class RecordActorPrototype implements ActorPrototype {
     }
 
     private String functionName(Class<? extends ActorStep> functionClass) {
-        return functionClass.getSimpleName().toUpperCase();
+        return ActorStep.functionName(functionClass);
     }
 
     private ActorStep constructState(String message) throws ReflectiveOperationException {
@@ -145,4 +143,43 @@ public abstract class RecordActorPrototype implements ActorPrototype {
         }
     }
 
+    /** Get a list of JSON prototypes for each actor step declared by this actor */
+    @SuppressWarnings("unchecked")
+    public Map<String, String> getMessagePrototypes() {
+        Map<String, String> messagePrototypes = new HashMap<>();
+
+        for (var clazz : getClass().getDeclaredClasses()) {
+            if (!clazz.isRecord() || !ActorStep.class.isAssignableFrom(clazz))
+                continue;
+
+            StringJoiner sj = new StringJoiner(",\n\t", "{\n\t", "\n}");
+
+            renderToJsonPrototype(sj, (Class<? extends Record>) clazz);
+
+            messagePrototypes.put(ActorStep.functionName((Class<? extends ActorStep>) clazz), sj.toString());
+        }
+
+        return messagePrototypes;
+    }
+
+    @SuppressWarnings("unchecked")
+    private void renderToJsonPrototype(StringJoiner sj, Class<? extends Record> recordType) {
+        for (var field : recordType.getDeclaredFields()) {
+            String typeName = field.getType().getSimpleName();
+
+            if ("List".equals(typeName)) {
+                sj.add(String.format("\"%s\": [ ]", field.getName()));
+            }
+            else if (field.getType().isRecord()) {
+                var innerSj = new StringJoiner(",", "{", "}");
+                renderToJsonPrototype(innerSj, (Class<? extends Record>) field.getType());
+                sj.add(String.format("\"%s\": %s", field.getName(), sj));
+            }
+            else {
+                sj.add(String.format("\"%s\": \"%s\"", field.getName(), typeName));
+            }
+        }
+    }
+
 }
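As a worked example of the rendering above: for an actor declaring public record Run(long fileStorageId) implements ActorStep {}, getMessagePrototypes() maps the key "RUN" (the upper-cased simple name from ActorStep.functionName) to the prototype string below, with the tab indentation coming from the ",\n\t" / "{\n\t" joiner:

{
	"fileStorageId": "long"
}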
@@ -1,3 +1,7 @@
 package nu.marginalia.actor.state;
 
-public interface ActorStep {}
+public interface ActorStep {
+    static String functionName(Class<? extends ActorStep> type) {
+        return type.getSimpleName().toUpperCase();
+    }
+}
@@ -35,6 +35,7 @@ import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
+import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
 import java.util.Optional;
@@ -203,11 +204,18 @@ public class ConverterMain extends ProcessMainClass {
 
             logger.info("Processing small items");
 
+            // We separate the large and small domains to reduce the number of critical sections,
+            // as the large domains have a separate processing track that doesn't store everything
+            // in memory
+
+            final List<Path> bigTasks = new ArrayList<>();
+
             // First process the small items
             for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(),
                     new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog)))
             {
                 if (SerializableCrawlDataStream.getSizeHint(dataPath) >= SIDELOAD_THRESHOLD) {
+                    bigTasks.add(dataPath);
                     continue;
                 }
 
@@ -235,30 +243,28 @@ public class ConverterMain extends ProcessMainClass {
 
             logger.info("Processing large items");
 
-            // Next the big items domain-by-domain
-            for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(),
-                    new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog)))
-            {
-                int sizeHint = SerializableCrawlDataStream.getSizeHint(dataPath);
-                if (sizeHint < SIDELOAD_THRESHOLD) {
-                    continue;
-                }
+            try (var hb = heartbeat.createAdHocTaskHeartbeat("Large Domains")) {
+                int bigTaskIdx = 0;
+                // Next the big items domain-by-domain
+                for (var dataPath : bigTasks) {
+                    hb.progress(dataPath.toFile().getName(), bigTaskIdx++, bigTasks.size());
 
                     try {
                         // SerializableCrawlDataStream is autocloseable, we can't try-with-resources because then it will be
                         // closed before it's consumed by the converterWriter. Instead, the converterWriter guarantees it
                         // will close it after it's consumed.
                         var stream = SerializableCrawlDataStream.openDataStream(dataPath);
-                        ConverterBatchWritableIf writable = processor.simpleProcessing(stream, sizeHint);
+                        ConverterBatchWritableIf writable = processor.simpleProcessing(stream, SerializableCrawlDataStream.getSizeHint(dataPath));
 
                         converterWriter.accept(writable);
                     }
                     catch (Exception ex) {
                         logger.info("Error in processing", ex);
                     }
                     finally {
                         heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains);
                     }
+                }
             }
         }
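Taken together, the two ConverterMain hunks switch from scanning the work log twice (once filtering for small items, once for large) to a single pass that defers oversized domains into bigTasks and replays that list afterwards with per-task progress. A generic sketch of the two-pass partitioning pattern, with hypothetical names and a made-up size threshold:

import java.util.ArrayList;
import java.util.List;

class TwoPassSketch {
    static final int THRESHOLD = 10_000; // illustrative sideload threshold

    static void process(List<String> tasks) {
        List<String> bigTasks = new ArrayList<>();

        // Pass 1: handle small tasks immediately, defer the big ones
        for (String task : tasks) {
            if (sizeHint(task) >= THRESHOLD) {
                bigTasks.add(task);
                continue;
            }
            handleSmall(task);
        }

        // Pass 2: big tasks one at a time, with progress against a known total
        int idx = 0;
        for (String task : bigTasks) {
            System.out.printf("large task %d/%d: %s%n", ++idx, bigTasks.size(), task);
            handleLarge(task);
        }
    }

    static int sizeHint(String t) { return t.length(); }
    static void handleSmall(String t) { /* ... */ }
    static void handleLarge(String t) { /* ... */ }
}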
@@ -116,7 +116,7 @@ public class AdblockSimulator {
 
 
     // Refrain from cleaning up this code, it's very hot code and needs to be fast.
-    // This version is about 100x faster than the a "clean" first stab implementation.
+    // This version is about 100x faster than a "clean" first stab implementation.
 
     class RuleVisitor implements NodeFilter {
         public boolean sawAds;
@@ -23,7 +23,7 @@ public class DocumentGeneratorExtractor {
 
         var tags = doc.select("meta[name=generator]");
 
-        if (tags.size() == 0) {
+        if (tags.isEmpty()) {
             // Some sites have a comment in the head instead of a meta tag
             return fingerprintServerTech(doc, responseHeaders);
         }
@@ -127,7 +127,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
         }
         fullHtml.append("</div></body></html>");
 
-        var doc = sideloaderProcessing
+        return sideloaderProcessing
                 .processDocument(fullUrl,
                         fullHtml.toString(),
                         List.of("encyclopedia", "wiki"),
@@ -137,8 +137,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
                         anchorTextKeywords.getAnchorTextKeywords(domainLinks, new EdgeUrl(fullUrl)),
                         LocalDate.now().getYear(),
                         10_000_000);
-
-        return doc;
     }
 
     private String normalizeUtf8(String url) {
@@ -20,7 +20,6 @@ import nu.marginalia.crawl.warc.WarcArchiverFactory;
 import nu.marginalia.crawl.warc.WarcArchiverIf;
 import nu.marginalia.db.DomainBlacklist;
 import nu.marginalia.io.CrawlerOutputFile;
-import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.mq.MessageQueueFactory;
 import nu.marginalia.process.ProcessConfiguration;
@@ -417,13 +416,13 @@ public class CrawlerMain extends ProcessMainClass {
             try {
                 Path slopPath = CrawlerOutputFile.getSlopPath(outputDir, id, domain);
                 if (Files.exists(slopPath)) {
-                    return new CrawlDataReference(SerializableCrawlDataStream.openDataStream(slopPath));
+                    return new CrawlDataReference(slopPath);
                 }
 
                 Path parquetPath = CrawlerOutputFile.getParquetPath(outputDir, id, domain);
                 if (Files.exists(parquetPath)) {
                     slopPath = migrateParquetData(parquetPath, domain, outputDir);
-                    return new CrawlDataReference(SerializableCrawlDataStream.openDataStream(slopPath));
+                    return new CrawlDataReference(slopPath);
                 }
 
             } catch (IOException e) {
@@ -45,6 +45,7 @@ public class HttpFetcherImpl implements HttpFetcher {
     private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
 
     private final Duration requestTimeout = Duration.ofSeconds(10);
+    private final Duration probeTimeout = Duration.ofSeconds(30);
 
     @Override
     public void setAllowAllContentTypes(boolean allowAllContentTypes) {
@@ -107,23 +108,27 @@ public class HttpFetcherImpl implements HttpFetcher {
                     .HEAD()
                     .uri(url.asURI())
                     .header("User-agent", userAgentString)
-                    .timeout(requestTimeout)
+                    .timeout(probeTimeout)
                     .build();
         } catch (URISyntaxException e) {
             return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
         }
 
-        try {
-            var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
-            EdgeUrl rspUri = new EdgeUrl(rsp.uri());
+        for (int tries = 0;; tries++) {
+            try {
+                var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
+                EdgeUrl rspUri = new EdgeUrl(rsp.uri());
 
-            if (!Objects.equals(rspUri.domain, url.domain)) {
-                return new DomainProbeResult.Redirect(rspUri.domain);
+                if (!Objects.equals(rspUri.domain, url.domain)) {
+                    return new DomainProbeResult.Redirect(rspUri.domain);
+                }
+                return new DomainProbeResult.Ok(rspUri);
+            } catch (Exception ex) {
+                if (tries > 3) {
+                    return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
+                }
+                // else try again ...
             }
-            return new DomainProbeResult.Ok(rspUri);
-        }
-        catch (Exception ex) {
-            return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
         }
     }
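The probe now tolerates transient failures: it uses the longer 30-second probeTimeout and retries the HEAD request, only giving up once tries exceeds 3, rather than failing a whole domain on the first network hiccup. A generic bounded-retry sketch of the same shape, with hypothetical names:

import java.util.concurrent.Callable;

class RetrySketch {
    // Retry the call up to maxTries attempts; rethrow the last failure.
    static <T> T withRetries(Callable<T> call, int maxTries) throws Exception {
        for (int tries = 0;; tries++) {
            try {
                return call.call();
            } catch (Exception ex) {
                if (tries + 1 >= maxTries) {
                    throw ex;
                }
                // else try again ...
            }
        }
    }
}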
@@ -143,7 +148,7 @@ public class HttpFetcherImpl implements HttpFetcher {
         var headBuilder = HttpRequest.newBuilder()
                 .HEAD()
                 .uri(url.asURI())
-                .header("User-agent", userAgentString)
+                .header("User-Agent", userAgentString)
                 .header("Accept-Encoding", "gzip")
                 .timeout(requestTimeout)
         ;
@@ -215,7 +220,7 @@ public class HttpFetcherImpl implements HttpFetcher {
         var getBuilder = HttpRequest.newBuilder()
                 .GET()
                 .uri(url.asURI())
-                .header("User-agent", userAgentString)
+                .header("User-Agent", userAgentString)
                 .header("Accept-Encoding", "gzip")
                 .header("Accept-Language", "en,*;q=0.5")
                 .header("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
@@ -307,7 +312,7 @@ public class HttpFetcherImpl implements HttpFetcher {
                 .uri(sitemapUrl.asURI())
                 .header("Accept-Encoding", "gzip")
                 .header("Accept", "text/*, */*;q=0.9")
-                .header("User-agent", userAgentString)
+                .header("User-Agent", userAgentString)
                 .timeout(requestTimeout)
                 .build();
 
@@ -386,7 +391,7 @@ public class HttpFetcherImpl implements HttpFetcher {
                 .uri(url.asURI())
                 .header("Accept-Encoding", "gzip")
                 .header("Accept", "text/*, */*;q=0.9")
-                .header("User-agent", userAgentString)
+                .header("User-Agent", userAgentString)
                 .timeout(requestTimeout);
 
         HttpFetchResult result = recorder.fetch(client, getRequest.build());
@@ -4,6 +4,7 @@ import nu.marginalia.ContentTypes;
 import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.lsh.EasyLSH;
 import nu.marginalia.model.crawldata.CrawledDocument;
+import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -11,51 +12,73 @@ import javax.annotation.Nullable;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.util.Iterator;
+import java.util.Objects;
+import java.util.Optional;
 
 /** A reference to a domain that has been crawled before. */
-public class CrawlDataReference implements AutoCloseable {
+public class CrawlDataReference implements AutoCloseable, Iterable<CrawledDocument> {
+
+    private boolean closed = false;
+
+    @Nullable
+    private final Path path;
+
+    @Nullable
+    private SerializableCrawlDataStream data = null;
 
-    private final SerializableCrawlDataStream data;
     private static final Logger logger = LoggerFactory.getLogger(CrawlDataReference.class);
 
-    public CrawlDataReference(SerializableCrawlDataStream data) {
-        this.data = data;
+    public CrawlDataReference(@Nullable Path path) {
+        this.path = path;
     }
 
     public CrawlDataReference() {
-        this(SerializableCrawlDataStream.empty());
+        this(null);
     }
 
     /** Delete the associated data from disk, if it exists */
     public void delete() throws IOException {
-        Path filePath = data.path();
-
-        if (filePath != null) {
-            Files.deleteIfExists(filePath);
+        if (path != null) {
+            Files.deleteIfExists(path);
         }
     }
 
-    /** Get the next document from the crawl data,
-     * returning null when there are no more documents
-     * available
-     */
-    @Nullable
-    public CrawledDocument nextDocument() {
-        try {
-            while (data.hasNext()) {
-                if (data.next() instanceof CrawledDocument doc) {
-                    if (!ContentTypes.isAccepted(doc.contentType))
-                        continue;
-
-                    return doc;
-                }
-            }
-        }
-        catch (IOException ex) {
-            logger.error("Failed to read next document", ex);
-        }
-
-        return null;
+    public @NotNull Iterator<CrawledDocument> iterator() {
+        requireStream();
+        // Guaranteed by requireStream, but helps java
+        Objects.requireNonNull(data);
+
+        return data.map(next -> {
+            if (next instanceof CrawledDocument doc && ContentTypes.isAccepted(doc.contentType)) {
+                return Optional.of(doc);
+            }
+            else {
+                return Optional.empty();
+            }
+        });
+    }
+
+    /** After calling this method, data is guaranteed to be non-null */
+    private void requireStream() {
+        if (closed) {
+            throw new IllegalStateException("Use after close()");
+        }
+
+        if (data == null) {
+            try {
+                if (path != null) {
+                    data = SerializableCrawlDataStream.openDataStream(path);
+                    return;
+                }
+            }
+            catch (Exception ex) {
+                logger.error("Failed to open stream", ex);
+            }
+
+            data = SerializableCrawlDataStream.empty();
+        }
     }
 
     public static boolean isContentBodySame(byte[] one, byte[] other) {
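CrawlDataReference now holds only a Path and opens the underlying stream lazily on first iteration, falling back to an empty stream when the file is absent or unreadable; constructing a reference is therefore free, and resources are only acquired if the recrawl actually consumes it. A generic, JDK-only sketch of the lazy-open Iterable pattern with hypothetical names (single-use, like the crawl data stream it mirrors):

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.stream.Stream;

// Hypothetical illustration: open the backing file only when iteration starts.
class LazyLines implements Iterable<String>, AutoCloseable {
    private final Path path;
    private Stream<String> stream = null;

    LazyLines(Path path) { this.path = path; }

    @Override
    public Iterator<String> iterator() {
        if (stream == null) {
            try {
                stream = Files.lines(path);
            } catch (IOException ex) {
                stream = Stream.empty();  // degrade to "no previous data"
            }
        }
        return stream.iterator();
    }

    @Override
    public void close() {
        if (stream != null) stream.close();
    }
}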
@@ -98,7 +121,12 @@ public class CrawlDataReference implements AutoCloseable {
     }
 
     @Override
-    public void close() throws Exception {
-        data.close();
+    public void close() throws IOException {
+        if (!closed) {
+            if (data != null) {
+                data.close();
+            }
+            closed = true;
+        }
     }
 }
@@ -89,30 +89,45 @@ public class CrawlerRetreiver implements AutoCloseable {
     }
 
     public int crawlDomain(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
-        try {
+        try (oldCrawlData) {
             // Do an initial domain probe to determine the root URL
-            EdgeUrl rootUrl;
-
             var probeResult = probeRootUrl();
-            switch (probeResult) {
+
+            return switch (probeResult) {
                 case HttpFetcher.DomainProbeResult.Ok(EdgeUrl probedUrl) -> {
-                    rootUrl = probedUrl; // Good track
+                    // Sleep after the initial probe, we don't have access to the robots.txt yet
+                    // so we don't know the crawl delay
+                    TimeUnit.SECONDS.sleep(1);
+
+                    final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(probedUrl.domain, warcRecorder);
+                    final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
+
+                    delayTimer.waitFetchDelay(0); // initial delay after robots.txt
+
+                    DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, delayTimer);
+                    domainStateDb.save(summaryRecord);
+
+                    // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
+                    if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
+                        // If we have reference data, we will always grow the crawl depth a bit
+                        crawlFrontier.increaseDepth(1.5, 2500);
+                    }
+
+                    oldCrawlData.close(); // proactively close the crawl data reference here to not hold onto expensive resources
+
+                    yield crawlDomain(probedUrl, robotsRules, delayTimer, domainLinks);
                 }
                 case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> {
                     domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString()));
-                    return 1;
+                    yield 1;
                 }
                 case HttpFetcher.DomainProbeResult.Error(CrawlerDomainStatus status, String desc) -> {
                     domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, status.toString(), desc));
-                    return 1;
+                    yield 1;
                 }
-            }
-
-            // Sleep after the initial probe, we don't have access to the robots.txt yet
-            // so we don't know the crawl delay
-            TimeUnit.SECONDS.sleep(1);
-
-            return crawlDomain(oldCrawlData, rootUrl, domainLinks);
+            };
         }
         catch (Exception ex) {
             logger.error("Error crawling domain {}", domain, ex);
@@ -120,28 +135,15 @@ public class CrawlerRetreiver implements AutoCloseable {
         }
     }
 
-    private int crawlDomain(CrawlDataReference oldCrawlData,
-                            EdgeUrl rootUrl,
-                            DomainLinks domainLinks) throws InterruptedException {
-
-        final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder);
-        final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
-
-        delayTimer.waitFetchDelay(0); // initial delay after robots.txt
-
-        DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(rootUrl, delayTimer);
-        domainStateDb.save(summaryRecord);
-
-        // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
-        if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
-            // If we have reference data, we will always grow the crawl depth a bit
-            crawlFrontier.increaseDepth(1.5, 2500);
-        }
-
+    private int crawlDomain(EdgeUrl rootUrl,
+                            SimpleRobotRules robotsRules,
+                            CrawlDelayTimer delayTimer,
+                            DomainLinks domainLinks) {
+
         // Add external links to the crawl frontier
         crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));
 
         // Fetch sitemaps
         for (var sitemap : robotsRules.getSitemaps()) {
             crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
@@ -379,8 +381,10 @@ public class CrawlerRetreiver implements AutoCloseable {
             if (docOpt.isPresent()) {
                 var doc = docOpt.get();
 
-                crawlFrontier.enqueueLinksFromDocument(top, doc);
-                crawlFrontier.addVisited(new EdgeUrl(ok.uri()));
+                var responseUrl = new EdgeUrl(ok.uri());
+
+                crawlFrontier.enqueueLinksFromDocument(responseUrl, doc);
+                crawlFrontier.addVisited(responseUrl);
             }
         }
         else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {
@@ -40,18 +40,12 @@ public class CrawlerRevisitor {
         int errors = 0;
         int skipped = 0;
 
-        for (;;) {
+        for (CrawledDocument doc : oldCrawlData) {
             if (errors > 20) {
                 // If we've had too many errors, we'll stop trying to recrawl
                 break;
             }
 
-            CrawledDocument doc = oldCrawlData.nextDocument();
-
-            if (doc == null)
-                break;
-
-            // This Shouldn't Happen (TM)
             var urlMaybe = EdgeUrl.parse(doc.url);
             if (urlMaybe.isEmpty())
                 continue;
@@ -12,8 +12,7 @@ import java.io.InputStream;
 import java.net.InetAddress;
 import java.net.URI;
 import java.net.http.HttpHeaders;
-import java.util.Arrays;
-import java.util.Optional;
+import java.util.*;
 
 /* FIXME: This interface has a very unfortunate name that is not very descriptive.
  */
@@ -65,7 +64,21 @@ public sealed interface HttpFetchResult {
     ) implements HttpFetchResult {
 
         public ResultOk(URI uri, int status, MessageHeaders headers, String ipAddress, byte[] bytes, int bytesStart, int length) {
-            this(uri, status, HttpHeaders.of(headers.map(), (k,v) -> true), ipAddress, bytes, bytesStart, length);
+            this(uri, status, convertHeaders(headers), ipAddress, bytes, bytesStart, length);
+        }
+
+        private static HttpHeaders convertHeaders(MessageHeaders messageHeaders) {
+            Map<String, List<String>> inputMap = messageHeaders.map();
+            Map<String, List<String>> filteredMap = new HashMap<>(Math.max(4, inputMap.size()));
+
+            inputMap.forEach((k, v) -> {
+                if (k.isBlank()) return;
+                if (!Character.isAlphabetic(k.charAt(0))) return;
+
+                filteredMap.put(k, v);
+            });
+
+            return HttpHeaders.of(filteredMap, (k,v) -> true);
         }
 
         public boolean isOk() {
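The filter drops blank keys and keys whose first character is not alphabetic, which presumably guards against HTTP/2 pseudo-headers such as :status leaking in through the raw MessageHeaders map. A small self-contained illustration of the same filtering, assuming nothing beyond the JDK:

import java.net.http.HttpHeaders;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

class HeaderFilterSketch {
    public static void main(String[] args) {
        // A raw header map as it might arrive from an HTTP/2 exchange
        Map<String, List<String>> raw = Map.of(
                ":status", List.of("200"),            // pseudo-header: filtered out
                "Content-Type", List.of("text/html")  // regular header: kept
        );

        Map<String, List<String>> filtered = new HashMap<>();
        raw.forEach((k, v) -> {
            if (k.isBlank()) return;
            if (!Character.isAlphabetic(k.charAt(0))) return;
            filtered.put(k, v);
        });

        HttpHeaders headers = HttpHeaders.of(filtered, (k, v) -> true);
        System.out.println(headers.map()); // {Content-Type=[text/html]}
    }
}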
@@ -108,15 +108,17 @@ public record SlopCrawlDataRecord(String domain,
     public static void convertFromParquet(Path parquetInput, Path slopOutput) throws IOException {
         Path tempDir = Files.createTempDirectory(slopOutput.getParent(), "conversion");
 
-        try (var writer = new Writer(tempDir)) {
-            CrawledDocumentParquetRecordFileReader.stream(parquetInput).forEach(
-                    parquetRecord -> {
-                        try {
-                            writer.write(new SlopCrawlDataRecord(parquetRecord));
-                        } catch (IOException e) {
-                            throw new RuntimeException(e);
-                        }
-                    });
+        try (var writer = new Writer(tempDir);
+             var stream = CrawledDocumentParquetRecordFileReader.stream(parquetInput))
+        {
+            stream.forEach(
+                    parquetRecord -> {
+                        try {
+                            writer.write(new SlopCrawlDataRecord(parquetRecord));
+                        } catch (IOException e) {
+                            throw new RuntimeException(e);
+                        }
+                    });
         }
         catch (IOException ex) {
             FileUtils.deleteDirectory(tempDir.toFile());
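The change moves the record stream into the try-with-resources header: a stream backed by a file (here, the parquet reader) holds an open handle that forEach alone never releases. A minimal JDK-only illustration of the same fix, with a hypothetical input path:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.stream.Stream;

class StreamCloseSketch {
    static long countNonEmpty(Path input) throws IOException {
        // Files.lines() holds a file handle; closing the Stream releases it
        try (Stream<String> lines = Files.lines(input)) {
            return lines.filter(line -> !line.isBlank()).count();
        }
    }
}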
@@ -375,7 +375,7 @@ class CrawlerRetreiverTest {
         doCrawl(tempFileWarc1, specs);
         convertToParquet(tempFileWarc1, tempFileParquet1);
         doCrawlWithReferenceStream(specs,
-                SerializableCrawlDataStream.openDataStream(tempFileParquet1)
+                new CrawlDataReference(tempFileParquet1)
         );
         convertToParquet(tempFileWarc2, tempFileParquet2);
 
@@ -447,11 +447,9 @@ class CrawlerRetreiverTest {
             throw new RuntimeException(e);
         }
 
-        var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1);
-
         System.out.println("---");
 
-        doCrawlWithReferenceStream(specs, stream);
+        doCrawlWithReferenceStream(specs, new CrawlDataReference(tempFileParquet1));
 
         var revisitCrawlFrontier = new DomainCrawlFrontier(
                 new EdgeDomain("www.marginalia.nu"),
@@ -508,12 +506,11 @@ class CrawlerRetreiverTest {
         }
     }
 
-    private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
+    private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, CrawlDataReference reference) {
         try (var recorder = new WarcRecorder(tempFileWarc2, new Cookies());
             var db = new DomainStateDb(tempFileDb)
        ) {
-            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(),
-                    new CrawlDataReference(stream));
+            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(), reference);
         }
         catch (IOException | SQLException ex) {
             Assertions.fail(ex);
@@ -9,7 +9,7 @@
         <span>
             Access logs containing IP-addresses are retained for up to 24 hours,
             anonymized logs with source addresses removed are sometimes kept longer
-            for to help diagnosing bugs.
+            to help diagnose bugs.
         </span>
     </div>
     <div class="flex space-y-4 flex-col">
@@ -33,4 +33,4 @@
     </span>
     </div>
 
 </footer>
@@ -16,8 +16,6 @@ platforms, but for lack of suitable hardware, this can not be guaranteed.
 The civilized way of installing this is to use [SDKMAN](https://sdkman.io/);
 graalce is a good distribution choice but it doesn't matter too much.
 
-**Tailwindcss** - Install NPM and run `npm install tailwindcss @tailwindcss/cli`
-
 ## Quick Set up
 
 [https://docs.marginalia.nu/](https://docs.marginalia.nu/) has a more comprehensive guide for the install
@@ -74,3 +74,7 @@ download_model model/tfreq-new-algo3.bin https://huggingface.co/MarginaliaNu/Mar
 download_model model/lid.176.ftz https://huggingface.co/MarginaliaNu/MarginaliaModelData/resolve/c9339e4224f1dfad7f628809c32687e748198ae3/lid.176.ftz?download=true 340156704bb8c8e50c4abf35a7ec2569
 
 popd
+
+pushd $(dirname $0)/..
+npm install -D tailwindcss@3
+popd
@@ -234,11 +234,12 @@ dependencyResolutionManagement {
         library('jetty-util','org.eclipse.jetty','jetty-util').version('9.4.54.v20240208')
         library('jetty-servlet','org.eclipse.jetty','jetty-servlet').version('9.4.54.v20240208')
 
-        library('slop', 'nu.marginalia', 'slop').version('0.0.9-org-5-SNAPSHOT')
+        library('slop', 'nu.marginalia', 'slop').version('0.0.10-SNAPSHOT')
         library('jooby-netty','io.jooby','jooby-netty').version(joobyVersion)
         library('jooby-jte','io.jooby','jooby-jte').version(joobyVersion)
         library('jooby-apt','io.jooby','jooby-apt').version(joobyVersion)
+
+        library('wiremock', 'org.wiremock','wiremock').version('3.11.0')
         library('jte','gg.jte','jte').version('3.1.15')
 
         bundle('jetty', ['jetty-server', 'jetty-util', 'jetty-servlet'])