(search) Add link promoting the redesign beta

(deploy) Add hashbang to deploy script
(crawler) Add a new system property crawler.maxFetchSize
2025-10-05 21:22:39 +02:00 · 2024-12-30 15:47:13 +01:00 · 2024-12-30 15:47:13 +01:00 · 2024-12-30 15:10:11 +01:00 · 2024-12-27 20:56:42 +01:00 · 2024-12-27 20:54:42 +01:00
8 changed files with 112 additions and 66 deletions
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/ContentTags.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/ContentTags.java
@@ -20,34 +20,11 @@ public record ContentTags(String etag, String lastMod) {
    public void paint(Request.Builder getBuilder) {

        if (etag != null) {
-            getBuilder.addHeader("If-None-Match", ifNoneMatch());
+            getBuilder.addHeader("If-None-Match", etag);
        }

        if (lastMod != null) {
-            getBuilder.addHeader("If-Modified-Since", ifModifiedSince());
+            getBuilder.addHeader("If-Modified-Since", lastMod);
        }
    }
-
-    private String ifNoneMatch() {
-        // Remove the W/ prefix if it exists
-
-        //'W/' (case-sensitive) indicates that a weak validator is used. Weak etags are
-        // easy to generate, but are far less useful for comparisons. Strong validators
-        // are ideal for comparisons but can be very difficult to generate efficiently.
-        // Weak ETag values of two representations of the same resources might be semantically
-        // equivalent, but not byte-for-byte identical. This means weak etags prevent caching
-        // when byte range requests are used, but strong etags mean range requests can
-        // still be cached.
-        // - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/ETag
-
-        if (null != etag && etag.startsWith("W/")) {
-            return etag.substring(2);
-        } else {
-            return etag;
-        }
-    }
-
-    private String ifModifiedSince() {
-        return lastMod;
-    }
 }
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java
@@ -34,8 +34,9 @@ import java.util.*;
 public class WarcRecorder implements AutoCloseable {
    /** Maximum time we'll wait on a single request */
    static final int MAX_TIME = 30_000;
-    /** Maximum (decompressed) size we'll fetch */
-    static final int MAX_SIZE = 1024 * 1024 * 10;
+
+    /** Maximum (decompressed) size we'll save */
+    static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);

    private final WarcWriter writer;
    private final Path warcFile;
--- a/code/processes/crawling-process/model/java/nu/marginalia/io/SerializableCrawlDataStream.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/SerializableCrawlDataStream.java
@@ -1,11 +1,15 @@
 package nu.marginalia.io;

+import nu.marginalia.model.crawldata.CrawledDocument;
+import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.model.crawldata.SerializableCrawlData;
 import org.jetbrains.annotations.Nullable;

 import java.io.IOException;
 import java.nio.file.Path;
+import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.List;

 /** Closable iterator exceptional over serialized crawl data
 * The data may appear in any order, and the iterator must be closed.
@@ -26,6 +30,37 @@ public interface SerializableCrawlDataStream extends AutoCloseable {
    @Nullable
    default Path path() { return null; }

+    /** For tests */
+    default List<SerializableCrawlData> asList() throws IOException {
+        List<SerializableCrawlData> data = new ArrayList<>();
+        while (hasNext()) {
+            data.add(next());
+        }
+        return data;
+    }
+
+    /** For tests */
+    default List<CrawledDocument> docsAsList() throws IOException {
+        List<CrawledDocument> data = new ArrayList<>();
+        while (hasNext()) {
+            if (next() instanceof CrawledDocument doc) {
+                data.add(doc);
+            }
+        }
+        return data;
+    }
+
+    /** For tests */
+    default List<CrawledDomain> domainsAsList() throws IOException {
+        List<CrawledDomain> data = new ArrayList<>();
+        while (hasNext()) {
+            if (next() instanceof CrawledDomain domain) {
+                data.add(domain);
+            }
+        }
+        return data;
+    }
+
    // Dummy iterator over nothing
    static SerializableCrawlDataStream empty() {
        return new SerializableCrawlDataStream() {
--- a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/SimpleLinkScraper.java
+++ b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/SimpleLinkScraper.java
@@ -26,6 +26,7 @@ import java.net.http.HttpHeaders;
 import java.net.http.HttpRequest;
 import java.net.http.HttpResponse;
 import java.time.Duration;
+import java.util.ArrayList;
 import java.util.List;
 import java.util.Optional;
 import java.util.concurrent.ThreadLocalRandom;
@@ -47,6 +48,8 @@ public class SimpleLinkScraper implements AutoCloseable {
    private final Duration readTimeout = Duration.ofSeconds(10);
    private final DomainLocks domainLocks = new DomainLocks();

+    private final static int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);
+
    public SimpleLinkScraper(LiveCrawlDataSet dataSet,
                             DbDomainQueries domainQueries,
                             DomainBlacklist domainBlacklist) {
@@ -65,52 +68,68 @@ public class SimpleLinkScraper implements AutoCloseable {
        pool.submitQuietly(() -> retrieveNow(domain, id.getAsInt(), urls));
    }

-    public void retrieveNow(EdgeDomain domain, int domainId, List<String> urls) throws Exception {
+    public int retrieveNow(EdgeDomain domain, int domainId, List<String> urls) throws Exception {
+
+        EdgeUrl rootUrl = domain.toRootUrlHttps();
+
+        List<EdgeUrl> relevantUrls = new ArrayList<>();
+
+        for (var url : urls) {
+            Optional<EdgeUrl> optParsedUrl = lp.parseLink(rootUrl, url);
+            if (optParsedUrl.isEmpty()) {
+                continue;
+            }
+            if (dataSet.hasUrl(optParsedUrl.get())) {
+                continue;
+            }
+            relevantUrls.add(optParsedUrl.get());
+        }
+
+        if (relevantUrls.isEmpty()) {
+            return 0;
+        }
+
+        int fetched = 0;
+
        try (HttpClient client = HttpClient
                .newBuilder()
                .connectTimeout(connectTimeout)
                .followRedirects(HttpClient.Redirect.NEVER)
                .version(HttpClient.Version.HTTP_2)
                .build();
-             DomainLocks.DomainLock lock = domainLocks.lockDomain(domain) // throttle concurrent access per domain; do not remove
+             // throttle concurrent access per domain; IDE will complain it's not used, but it holds a semaphore -- do not remove:
+             DomainLocks.DomainLock lock = domainLocks.lockDomain(domain)
        ) {
-
-            EdgeUrl rootUrl = domain.toRootUrlHttps();
-
            SimpleRobotRules rules = fetchRobotsRules(rootUrl, client);

            if (rules == null) { // I/O error fetching robots.txt
                // If we can't fetch the robots.txt,
-                for (var url : urls) {
-                    lp.parseLink(rootUrl, url).ifPresent(this::maybeFlagAsBad);
+                for (var url : relevantUrls) {
+                    maybeFlagAsBad(url);
                }
-                return;
+                return fetched;
            }

            CrawlDelayTimer timer = new CrawlDelayTimer(rules.getCrawlDelay());

-            for (var url : urls) {
-                Optional<EdgeUrl> optParsedUrl = lp.parseLink(rootUrl, url);
-                if (optParsedUrl.isEmpty()) {
-                    continue;
-                }
-                if (dataSet.hasUrl(optParsedUrl.get())) {
-                    continue;
-                }
+            for (var parsedUrl : relevantUrls) {

-                EdgeUrl parsedUrl = optParsedUrl.get();
-                if (!rules.isAllowed(url)) {
+                if (!rules.isAllowed(parsedUrl.toString())) {
                    maybeFlagAsBad(parsedUrl);
                    continue;
                }

                switch (fetchUrl(domainId, parsedUrl, timer, client)) {
-                    case FetchResult.Success(int id, EdgeUrl docUrl, String body, String headers)
-                            -> dataSet.saveDocument(id, docUrl, body, headers, "");
+                    case FetchResult.Success(int id, EdgeUrl docUrl, String body, String headers) -> {
+                            dataSet.saveDocument(id, docUrl, body, headers, "");
+                            fetched++;
+                    }
                    case FetchResult.Error(EdgeUrl docUrl) -> maybeFlagAsBad(docUrl);
                }
            }
        }
+
+        return fetched;
    }

    private void maybeFlagAsBad(EdgeUrl url) {
@@ -190,7 +209,7 @@ public class SimpleLinkScraper implements AutoCloseable {
                }

                byte[] body = getResponseData(response);
-                if (body.length > 1024 * 1024) {
+                if (body.length > MAX_SIZE) {
                    return new FetchResult.Error(parsedUrl);
                }

--- a/code/processes/live-crawling-process/test/nu/marginalia/livecrawler/SimpleLinkScraperTest.java
+++ b/code/processes/live-crawling-process/test/nu/marginalia/livecrawler/SimpleLinkScraperTest.java
@@ -3,8 +3,8 @@ package nu.marginalia.livecrawler;
 import nu.marginalia.db.DomainBlacklistImpl;
 import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeDomain;
+import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.crawldata.CrawledDomain;
 import org.apache.commons.io.FileUtils;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.Assertions;
@@ -38,7 +38,8 @@ class SimpleLinkScraperTest {
    @Test
    public void testRetrieveNow() throws Exception {
        var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class));
-        scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
+        int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
+        Assertions.assertEquals(1, fetched);

        var streams = dataSet.getDataStreams();
        Assertions.assertEquals(1, streams.size());
@@ -46,23 +47,20 @@ class SimpleLinkScraperTest {
        SerializableCrawlDataStream firstStream = streams.iterator().next();
        Assertions.assertTrue(firstStream.hasNext());

-        if (firstStream.next() instanceof CrawledDomain domain) {
-            Assertions.assertEquals("www.marginalia.nu",domain.getDomain());
-        }
-        else {
-            Assertions.fail();
-        }
+        List<CrawledDocument> documents = firstStream.docsAsList();
+        Assertions.assertEquals(1, documents.size());
+        Assertions.assertTrue(documents.getFirst().documentBody.startsWith("<!doctype"));
+    }

-        Assertions.assertTrue(firstStream.hasNext());

-        if ((firstStream.next() instanceof CrawledDocument document)) {
-            // verify we decompress the body string
-            Assertions.assertTrue(document.documentBody.startsWith("<!doctype"));
-        }
-        else{
-            Assertions.fail();
-        }

-        Assertions.assertFalse(firstStream.hasNext());
+    @Test
+    public void testRetrieveNow_Redundant() throws Exception {
+        dataSet.saveDocument(1, new EdgeUrl("https://www.marginalia.nu/"), "<html>", "", "127.0.0.1");
+        var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class));
+
+        // If the requested URL is already in the dataSet, retrieveNow should short-circuit and not fetch anything
+        int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
+        Assertions.assertEquals(0, fetched);
    }
 }
--- a/code/services-application/search-service/resources/templates/search/index/index-redesign.hdb
+++ b/code/services-application/search-service/resources/templates/search/index/index-redesign.hdb
@@ -0,0 +1,14 @@
+<section id="frontpage-tips">
+    <h2>Public Beta Available</h2>
+    <div class="info">
+        <p>
+            A redesigned version of the search engine UI is available for beta testing.
+            Feel free to give it a spin, feedback is welcome!
+            The old one will also remain available if you hate it,
+            or have compatibility issues.
+        </p>
+        <p>
+            <a href="https://test.marginalia.nu/">Try it out!</a>
+        </p>
+    </div>
+</section>
--- a/code/services-application/search-service/resources/templates/search/index/index.hdb
+++ b/code/services-application/search-service/resources/templates/search/index/index.hdb
@@ -24,7 +24,7 @@
 <section id="frontpage">
 {{>search/index/index-news}}
 {{>search/index/index-about}}
-{{>search/index/index-tips}}
+{{>search/index/index-redesign}}
 </section>

 {{>search/parts/search-footer}}
--- a/tools/deployment/deployment.py
+++ b/tools/deployment/deployment.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 from dataclasses import dataclass
 import subprocess, os
 from typing import List, Set, Dict, Optional
Author	SHA1	Message	Date
Viktor Lofgren	0ea8092350	(search) Add link promoting the redesign beta	2024-12-30 15:47:13 +01:00
Viktor Lofgren	483d29497e	(deploy) Add hashbang to deploy script	2024-12-30 15:47:13 +01:00
Viktor Lofgren	bae44497fe	(crawler) Add a new system property crawler.maxFetchSize This gives the same upper limit to the live crawler and the big boy crawler, though the live crawler will reject items too large, and the big crawler will truncate at that point.	2024-12-30 15:10:11 +01:00
Viktor Lofgren	0d59202aca	(crawler) Do not remove W/-prefix on weak e-tags The server expects to get them back prefixed, as we received them.	2024-12-27 20:56:42 +01:00
Viktor Lofgren	0ca43f0c9c	(live-crawler) Improve live crawler short-circuit logic We should not wait until we've fetched robots.txt to decide whether we have any data to fetch! This makes the live crawler very slow and leads to unnecessary requests.	2024-12-27 20:54:42 +01:00