mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits


10 Commits

Author SHA1 Message Date
Viktor Lofgren
59a8ea60f7 (search) Further reduce the number of db queries by adding more caching to DbDomainQueries. 2025-01-10 14:15:22 +01:00
Viktor Lofgren
aa9b1244ea (search) Reduce the number of db queries a bit by caching data that doesn't change too often 2025-01-10 13:56:04 +01:00
Viktor Lofgren
2d17233366 (search) Reduce the number of db queries a bit by caching data that doesn't change too often 2025-01-10 13:53:56 +01:00
Viktor Lofgren
b245cc9f38 (search) Reduce the number of db queries a bit by caching data that doesn't change too often 2025-01-10 13:46:19 +01:00
Viktor Lofgren
6614d05bdf (db) Make db pool size configurable 2025-01-09 20:20:51 +01:00
Viktor Lofgren
55aeb03c4a (feeds) Replace rssreader based parsing with a custom jsoup based rss parser
This solves some issues with the rssreader based parser, which was very picky about the XML being valid.  Jsoup is much more lenient when parsing malformed XML.
2025-01-09 18:29:55 +01:00
Viktor Lofgren
faa589962f (live-capture) Browserless now requires a token 2025-01-09 14:51:11 +01:00
Viktor Lofgren
c7edd6b39f (live-capture) Browserless now requires a token 2025-01-09 14:46:05 +01:00
Viktor Lofgren
79da622e3b (search) Update front page with new banner about move 2025-01-08 21:38:19 +01:00
Viktor Lofgren
3da8337ba6 (feeds) Add system property for exporting fetched feeds to a slop table for debugging 2025-01-08 20:49:16 +01:00
15 changed files with 329 additions and 322 deletions

View File: DbDomainQueries.java

@@ -20,7 +20,10 @@ public class DbDomainQueries {
     private final HikariDataSource dataSource;
     private static final Logger logger = LoggerFactory.getLogger(DbDomainQueries.class);
     private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
+    private final Cache<Integer, EdgeDomain> domainNameCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
+    private final Cache<String, List<DomainWithNode>> siblingsCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
 
     @Inject
     public DbDomainQueries(HikariDataSource dataSource)
@@ -30,16 +33,21 @@ public class DbDomainQueries {
     public Integer getDomainId(EdgeDomain domain) throws NoSuchElementException {
-        try (var connection = dataSource.getConnection()) {
         try {
             return domainIdCache.get(domain, () -> {
-                try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
+                try (var connection = dataSource.getConnection();
+                     var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
                     stmt.setString(1, domain.toString());
                     var rsp = stmt.executeQuery();
                     if (rsp.next()) {
                         return rsp.getInt(1);
                     }
                 }
+                catch (SQLException ex) {
+                    throw new RuntimeException(ex);
+                }
                 throw new NoSuchElementException();
             });
         }
@@ -49,9 +57,6 @@ public class DbDomainQueries {
         catch (ExecutionException ex) {
             throw new RuntimeException(ex.getCause());
         }
-        catch (SQLException ex) {
-            throw new RuntimeException(ex);
-        }
     }
 
     public OptionalInt tryGetDomainId(EdgeDomain domain) {
@@ -84,47 +89,55 @@ public class DbDomainQueries {
     }
 
     public Optional<EdgeDomain> getDomain(int id) {
-        try (var connection = dataSource.getConnection()) {
+        EdgeDomain existing = domainNameCache.getIfPresent(id);
+        if (existing != null) {
+            return Optional.of(existing);
+        }
+
+        try (var connection = dataSource.getConnection()) {
             try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
                 stmt.setInt(1, id);
                 var rsp = stmt.executeQuery();
                 if (rsp.next()) {
-                    return Optional.of(new EdgeDomain(rsp.getString(1)));
+                    var val = new EdgeDomain(rsp.getString(1));
+                    domainNameCache.put(id, val);
+                    return Optional.of(val);
                 }
                 return Optional.empty();
             }
         }
+        catch (UncheckedExecutionException ex) {
+            throw new RuntimeException(ex.getCause());
+        }
         catch (SQLException ex) {
             throw new RuntimeException(ex);
         }
     }
 
-    public List<DomainWithNode> otherSubdomains(EdgeDomain domain, int cnt) {
-        List<DomainWithNode> ret = new ArrayList<>();
-
-        try (var conn = dataSource.getConnection();
-             var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_TOP = ? LIMIT ?")) {
-            stmt.setString(1, domain.topDomain);
-            stmt.setInt(2, cnt);
-
-            var rs = stmt.executeQuery();
-            while (rs.next()) {
-                var sibling = new EdgeDomain(rs.getString(1));
-
-                if (sibling.equals(domain))
-                    continue;
-
-                ret.add(new DomainWithNode(sibling, rs.getInt(2)));
-            }
-        } catch (SQLException e) {
-            logger.error("Failed to get domain neighbors");
-        }
-
-        return ret;
-    }
+    public List<DomainWithNode> otherSubdomains(EdgeDomain domain, int cnt) throws ExecutionException {
+        String topDomain = domain.topDomain;
+
+        return siblingsCache.get(topDomain, () -> {
+            List<DomainWithNode> ret = new ArrayList<>();
+
+            try (var conn = dataSource.getConnection();
+                 var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_TOP = ? LIMIT ?")) {
+                stmt.setString(1, topDomain);
+                stmt.setInt(2, cnt);
+
+                var rs = stmt.executeQuery();
+                while (rs.next()) {
+                    var sibling = new EdgeDomain(rs.getString(1));
+
+                    if (sibling.equals(domain))
+                        continue;
+
+                    ret.add(new DomainWithNode(sibling, rs.getInt(2)));
+                }
+            } catch (SQLException e) {
+                logger.error("Failed to get domain neighbors");
+            }
+
+            return ret;
+        });
+    }
 
     public record DomainWithNode (EdgeDomain domain, int nodeAffinity) {
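The pattern behind all three caches is Guava's get-with-loader, which this minimal sketch mirrors (assuming only the Guava cache API the class already uses via CacheBuilder; the key type and query are simplified stand-ins):

    import com.google.common.cache.Cache;
    import com.google.common.cache.CacheBuilder;

    import java.util.concurrent.ExecutionException;

    class CacheDemo {
        // Bounded in-memory cache, same shape as the caches added above
        private final Cache<String, Integer> cache =
                CacheBuilder.newBuilder().maximumSize(10_000).build();

        int lookup(String key) throws ExecutionException {
            // On a miss the loader runs once and its result is stored;
            // any exception the loader throws is wrapped in ExecutionException
            return cache.get(key, () -> expensiveQuery(key));
        }

        private int expensiveQuery(String key) {
            return key.length(); // stand-in for the real database round-trip
        }
    }

Because the loader's exceptions are wrapped, callers either unwrap getCause() as getDomainId does, or propagate ExecutionException as the new otherSubdomains signature does.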

View File: DbDomainStatsExportMultitool.java (deleted)

@@ -1,118 +0,0 @@
package nu.marginalia.db;
import com.zaxxer.hikari.HikariDataSource;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.OptionalInt;
/** Class used in exporting data. This is intended to be used for a brief time
* and then discarded, not kept around as a service.
*/
public class DbDomainStatsExportMultitool implements AutoCloseable {
private final Connection connection;
private final int nodeId;
private final PreparedStatement knownUrlsQuery;
private final PreparedStatement visitedUrlsQuery;
private final PreparedStatement goodUrlsQuery;
private final PreparedStatement domainNameToId;
private final PreparedStatement allDomainsQuery;
private final PreparedStatement crawlQueueDomains;
private final PreparedStatement indexedDomainsQuery;
public DbDomainStatsExportMultitool(HikariDataSource dataSource, int nodeId) throws SQLException {
this.connection = dataSource.getConnection();
this.nodeId = nodeId;
knownUrlsQuery = connection.prepareStatement("""
SELECT KNOWN_URLS
FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
WHERE DOMAIN_NAME=?
""");
visitedUrlsQuery = connection.prepareStatement("""
SELECT VISITED_URLS
FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
WHERE DOMAIN_NAME=?
""");
goodUrlsQuery = connection.prepareStatement("""
SELECT GOOD_URLS
FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
WHERE DOMAIN_NAME=?
""");
domainNameToId = connection.prepareStatement("""
SELECT ID
FROM EC_DOMAIN
WHERE DOMAIN_NAME=?
""");
allDomainsQuery = connection.prepareStatement("""
SELECT DOMAIN_NAME
FROM EC_DOMAIN
""");
crawlQueueDomains = connection.prepareStatement("""
SELECT DOMAIN_NAME
FROM CRAWL_QUEUE
""");
indexedDomainsQuery = connection.prepareStatement("""
SELECT DOMAIN_NAME
FROM EC_DOMAIN
WHERE INDEXED > 0
""");
}
public OptionalInt getVisitedUrls(String domainName) throws SQLException {
return executeNameToIntQuery(domainName, visitedUrlsQuery);
}
public OptionalInt getDomainId(String domainName) throws SQLException {
return executeNameToIntQuery(domainName, domainNameToId);
}
public List<String> getCrawlQueueDomains() throws SQLException {
return executeListQuery(crawlQueueDomains, 100);
}
public List<String> getAllIndexedDomains() throws SQLException {
return executeListQuery(indexedDomainsQuery, 100_000);
}
private OptionalInt executeNameToIntQuery(String domainName, PreparedStatement statement)
throws SQLException {
statement.setString(1, domainName);
var rs = statement.executeQuery();
if (rs.next()) {
return OptionalInt.of(rs.getInt(1));
}
return OptionalInt.empty();
}
private List<String> executeListQuery(PreparedStatement statement, int sizeHint) throws SQLException {
List<String> ret = new ArrayList<>(sizeHint);
var rs = statement.executeQuery();
while (rs.next()) {
ret.add(rs.getString(1));
}
return ret;
}
@Override
public void close() throws SQLException {
knownUrlsQuery.close();
goodUrlsQuery.close();
visitedUrlsQuery.close();
allDomainsQuery.close();
crawlQueueDomains.close();
domainNameToId.close();
connection.close();
}
}

View File: DatabaseModule.java

@@ -89,7 +89,7 @@ public class DatabaseModule extends AbstractModule {
         config.addDataSourceProperty("prepStmtCacheSize", "250");
         config.addDataSourceProperty("prepStmtCacheSqlLimit", "2048");
 
-        config.setMaximumPoolSize(5);
+        config.setMaximumPoolSize(Integer.getInteger("db.poolSize", 5));
         config.setMinimumIdle(2);
 
         config.setMaxLifetime(Duration.ofMinutes(9).toMillis());
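Since Integer.getInteger reads a JVM system property and falls back to 5 when it is unset, the Hikari pool can now be sized per deployment by adding a flag such as -Ddb.poolSize=10 to the service's java invocation, with no rebuild required.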

View File: build.gradle

@@ -29,6 +29,7 @@ dependencies {
     implementation libs.jsoup
     implementation project(':third-party:rssreader')
     implementation libs.opencsv
+    implementation libs.slop
     implementation libs.sqlite
     implementation libs.bundles.slf4j
     implementation libs.commons.lang3
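The new libs.slop dependency (a version-catalog accessor) backs the slop-table feed journal introduced further down in FeedJournal.java.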

View File: BrowserlessClient.java

@@ -15,7 +15,9 @@ import java.util.Map;
 /** Client for local browserless.io API */
 public class BrowserlessClient implements AutoCloseable {
     private static final Logger logger = LoggerFactory.getLogger(BrowserlessClient.class);
+    private static final String BROWSERLESS_TOKEN = System.getProperty("live-capture.browserless-token", "BROWSERLESS_TOKEN");
 
     private final HttpClient httpClient = HttpClient.newBuilder()
             .version(HttpClient.Version.HTTP_1_1)
@@ -36,7 +38,7 @@ public class BrowserlessClient implements AutoCloseable {
         );
 
         var request = HttpRequest.newBuilder()
-                .uri(browserlessURI.resolve("/content"))
+                .uri(browserlessURI.resolve("/content?token="+BROWSERLESS_TOKEN))
                 .method("POST", HttpRequest.BodyPublishers.ofString(
                         gson.toJson(requestData)
                 ))
@@ -63,7 +65,7 @@ public class BrowserlessClient implements AutoCloseable {
         );
 
         var request = HttpRequest.newBuilder()
-                .uri(browserlessURI.resolve("/screenshot"))
+                .uri(browserlessURI.resolve("/screenshot?token="+BROWSERLESS_TOKEN))
                 .method("POST", HttpRequest.BodyPublishers.ofString(
                         gson.toJson(requestData)
                 ))
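Both requests now append the token as a query parameter. The live-capture.browserless-token system property (defaulting to the placeholder value BROWSERLESS_TOKEN) has to match whatever TOKEN the browserless container was started with, e.g. hypothetically docker run -e TOKEN=<secret> -p 3000:3000 browserless/chrome. The updated test further down wires this up the same way via testcontainers' withEnv.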

View File: FeedItem.java

@@ -1,6 +1,6 @@
 package nu.marginalia.rss.model;
 
-import com.apptasticsoftware.rssreader.Item;
+import nu.marginalia.rss.svc.SimpleFeedParser;
 import org.apache.commons.lang3.StringUtils;
 import org.jetbrains.annotations.NotNull;
 import org.jsoup.Jsoup;
@@ -18,37 +18,33 @@ public record FeedItem(String title,
     public static final int MAX_DESC_LENGTH = 255;
     public static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
 
-    public static FeedItem fromItem(Item item, boolean keepFragment) {
-        String title = item.getTitle().orElse("");
+    public static FeedItem fromItem(SimpleFeedParser.ItemData item, boolean keepFragment) {
+        String title = item.title();
         String date = getItemDate(item);
         String description = getItemDescription(item);
         String url;
 
-        if (keepFragment || item.getLink().isEmpty()) {
-            url = item.getLink().orElse("");
+        if (keepFragment) {
+            url = item.url();
         }
         else {
             try {
-                String link = item.getLink().get();
+                String link = item.url();
                 var linkUri = new URI(link);
                 var cleanUri = new URI(linkUri.getScheme(), linkUri.getAuthority(), linkUri.getPath(), linkUri.getQuery(), null);
                 url = cleanUri.toString();
             }
             catch (Exception e) {
                 // fallback to original link if we can't clean it, this is not a very important step
-                url = item.getLink().get();
+                url = item.url();
             }
         }
 
         return new FeedItem(title, date, description, url);
     }
 
-    private static String getItemDescription(Item item) {
-        Optional<String> description = item.getDescription();
-        if (description.isEmpty())
-            return "";
-
-        String rawDescription = description.get();
+    private static String getItemDescription(SimpleFeedParser.ItemData item) {
+        String rawDescription = item.description();
 
         if (rawDescription.indexOf('<') >= 0) {
             rawDescription = Jsoup.parseBodyFragment(rawDescription).text();
         }
@@ -58,15 +54,18 @@ public record FeedItem(String title,
     // e.g. http://fabiensanglard.net/rss.xml does dates like this: 1 Apr 2021 00:00:00 +0000
     private static final DateTimeFormatter extraFormatter = DateTimeFormatter.ofPattern("d MMM yyyy HH:mm:ss Z");
 
-    private static String getItemDate(Item item) {
+    private static String getItemDate(SimpleFeedParser.ItemData item) {
         Optional<ZonedDateTime> zonedDateTime = Optional.empty();
         try {
             zonedDateTime = item.getPubDateZonedDateTime();
         }
         catch (Exception e) {
-            zonedDateTime = item.getPubDate()
-                    .map(extraFormatter::parse)
-                    .map(ZonedDateTime::from);
+            try {
+                zonedDateTime = Optional.of(ZonedDateTime.from(extraFormatter.parse(item.pubDate())));
+            }
+            catch (Exception e2) {
+                // ignore
+            }
         }
 
         return zonedDateTime.map(date -> date.format(DATE_FORMAT)).orElse("");
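The fallback path is easy to exercise in isolation. A small sketch, using the pattern and example date from the code above; the Locale is pinned to English here, since month names like "Apr" are locale-sensitive, an assumption the service code leaves implicit:

    import java.time.ZonedDateTime;
    import java.time.format.DateTimeFormatter;
    import java.util.Locale;

    class DateFallbackDemo {
        private static final DateTimeFormatter extraFormatter =
                DateTimeFormatter.ofPattern("d MMM yyyy HH:mm:ss Z", Locale.ENGLISH);

        public static void main(String[] args) {
            // The fabiensanglard.net-style date quoted in the comment above
            var zdt = ZonedDateTime.from(extraFormatter.parse("1 Apr 2021 00:00:00 +0000"));
            System.out.println(zdt); // 2021-04-01T00:00Z
        }
    }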

View File: FeedFetcherService.java

@@ -1,7 +1,5 @@
 package nu.marginalia.rss.svc;
 
-import com.apptasticsoftware.rssreader.Item;
-import com.apptasticsoftware.rssreader.RssReader;
 import com.google.inject.Inject;
 import com.opencsv.CSVReader;
 import nu.marginalia.WmsaHome;
@@ -20,7 +18,6 @@ import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorage;
 import nu.marginalia.storage.model.FileStorageType;
 import nu.marginalia.util.SimpleBlockingThreadPool;
-import org.apache.commons.io.input.BOMInputStream;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -32,7 +29,6 @@ import java.net.URISyntaxException;
 import java.net.http.HttpClient;
 import java.net.http.HttpRequest;
 import java.net.http.HttpResponse;
-import java.nio.charset.StandardCharsets;
 import java.sql.SQLException;
 import java.time.*;
 import java.time.format.DateTimeFormatter;
@@ -48,8 +44,6 @@ public class FeedFetcherService {
     private static final int MAX_FEED_ITEMS = 10;
     private static final Logger logger = LoggerFactory.getLogger(FeedFetcherService.class);
 
-    private final RssReader rssReader = new RssReader();
-
     private final FeedDb feedDb;
     private final FileStorageService fileStorageService;
     private final NodeConfigurationService nodeConfigurationService;
@@ -72,17 +66,6 @@ public class FeedFetcherService {
         this.nodeConfigurationService = nodeConfigurationService;
         this.serviceHeartbeat = serviceHeartbeat;
         this.executorClient = executorClient;
-
-        // Add support for some alternate date tags for atom
-        rssReader.addItemExtension("issued", this::setDateFallback);
-        rssReader.addItemExtension("created", this::setDateFallback);
-    }
-
-    private void setDateFallback(Item item, String value) {
-        if (item.getPubDate().isEmpty()) {
-            item.setPubDate(value);
-        }
     }
 
     public enum UpdateMode {
@@ -96,6 +79,7 @@ public class FeedFetcherService {
             throw new IllegalStateException("Already updating feeds, refusing to start another update");
         }
 
         try (FeedDbWriter writer = feedDb.createWriter();
              HttpClient client = HttpClient.newBuilder()
                  .connectTimeout(Duration.ofSeconds(15))
@@ -103,6 +87,7 @@ public class FeedFetcherService {
                  .followRedirects(HttpClient.Redirect.NORMAL)
                  .version(HttpClient.Version.HTTP_2)
                  .build();
+             FeedJournal feedJournal = FeedJournal.create();
              var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Update Rss Feeds")
         ) {
             updating = true;
@@ -155,6 +140,8 @@ public class FeedFetcherService {
                         case FetchResult.Success(String value, String etag) -> {
                             writer.saveEtag(feed.domain(), etag);
                             writer.saveFeed(parseFeed(value, feed));
+
+                            feedJournal.record(feed.feedUrl(), value);
                         }
                         case FetchResult.NotModified() -> {
                             writer.saveEtag(feed.domain(), ifNoneMatchTag);
@@ -367,12 +354,7 @@ public class FeedFetcherService {
     public FeedItems parseFeed(String feedData, FeedDefinition definition) {
         try {
-            feedData = sanitizeEntities(feedData);
-
-            List<Item> rawItems = rssReader.read(
-                    // Massage the data to maximize the possibility of the flaky XML parser consuming it
-                    new BOMInputStream(new ByteArrayInputStream(feedData.trim().getBytes(StandardCharsets.UTF_8)), false)
-            ).toList();
+            List<SimpleFeedParser.ItemData> rawItems = SimpleFeedParser.parse(feedData);
 
             boolean keepUriFragment = rawItems.size() < 2 || areFragmentsDisparate(rawItems);
@@ -395,33 +377,6 @@ public class FeedFetcherService {
         }
     }
 
-    private static final Map<String, String> HTML_ENTITIES = Map.of(
-            "&raquo;", "»",
-            "&laquo;", "«",
-            "&mdash;", "--",
-            "&ndash;", "-",
-            "&rsquo;", "'",
-            "&lsquo;", "'",
-            "&quot;", "\"",
-            "&nbsp;", ""
-    );
-
-    /** The XML parser will blow up if you insert HTML entities in the feed XML,
-     *  which is unfortunately relatively common.  Replace them as far as is possible
-     *  with their corresponding characters
-     */
-    static String sanitizeEntities(String feedData) {
-        String result = feedData;
-        for (Map.Entry<String, String> entry : HTML_ENTITIES.entrySet()) {
-            result = result.replace(entry.getKey(), entry.getValue());
-        }
-
-        // Handle lone ampersands not part of a recognized XML entity
-        result = result.replaceAll("&(?!(amp|lt|gt|apos|quot);)", "&amp;");
-
-        return result;
-    }
 
     /** Decide whether to keep URI fragments in the feed items.
      * <p></p>
@@ -429,16 +384,16 @@ public class FeedFetcherService {
      * @param items The items to check
      * @return True if we should keep the fragments, false otherwise
      */
-    private boolean areFragmentsDisparate(List<Item> items) {
+    private boolean areFragmentsDisparate(List<SimpleFeedParser.ItemData> items) {
         Set<String> seenFragments = new HashSet<>();
 
         try {
             for (var item : items) {
-                if (item.getLink().isEmpty()) {
+                if (item.url().isBlank()) {
                     continue;
                 }
-                var link = item.getLink().get();
+                var link = item.url();
                 if (!link.contains("#")) {
                     continue;
                 }

View File: FeedJournal.java (new file)

@@ -0,0 +1,76 @@
package nu.marginalia.rss.svc;
import nu.marginalia.WmsaHome;
import nu.marginalia.slop.SlopTable;
import nu.marginalia.slop.column.string.StringColumn;
import nu.marginalia.slop.desc.StorageType;
import org.apache.commons.io.FileUtils;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.function.BiConsumer;
/** Utility for recording fetched feeds to a journal, useful in debugging feed parser issues.
*/
public interface FeedJournal extends AutoCloseable {
StringColumn urlColumn = new StringColumn("url");
StringColumn contentsColumn = new StringColumn("contents", StandardCharsets.UTF_8, StorageType.ZSTD);
void record(String url, String contents) throws IOException;
void close() throws IOException;
static FeedJournal create() throws IOException {
if (Boolean.getBoolean("feedFetcher.persistJournal")) {
Path journalPath = WmsaHome.getDataPath().resolve("feed-journal");
if (Files.isDirectory(journalPath)) {
FileUtils.deleteDirectory(journalPath.toFile());
}
Files.createDirectories(journalPath);
return new RecordingFeedJournal(journalPath);
}
else {
return new NoOpFeedJournal();
}
}
class NoOpFeedJournal implements FeedJournal {
@Override
public void record(String url, String contents) {}
@Override
public void close() {}
}
class RecordingFeedJournal extends SlopTable implements FeedJournal {
private final StringColumn.Writer urlWriter;
private final StringColumn.Writer contentsWriter;
public RecordingFeedJournal(Path path) throws IOException {
super(path, SlopTable.getNumPages(path, FeedJournal.urlColumn));
urlWriter = urlColumn.create(this);
contentsWriter = contentsColumn.create(this);
}
public synchronized void record(String url, String contents) throws IOException {
urlWriter.put(url);
contentsWriter.put(contents);
}
}
static void replay(Path journalPath, BiConsumer<String, String> urlAndContent) throws IOException {
try (SlopTable table = new SlopTable(journalPath)) {
final StringColumn.Reader urlReader = urlColumn.open(table);
final StringColumn.Reader contentsReader = contentsColumn.open(table);
while (urlReader.hasRemaining()) {
urlAndContent.accept(urlReader.get(), contentsReader.get());
}
}
}
}
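A sketch of how the journal is meant to be used; the property name, path, and replay signature come from the file above, and the printout is illustrative:

    import nu.marginalia.WmsaHome;
    import nu.marginalia.rss.svc.FeedJournal;
    import nu.marginalia.rss.svc.SimpleFeedParser;

    import java.io.IOException;
    import java.nio.file.Path;

    class ReplayJournal {
        public static void main(String[] args) throws IOException {
            // Written by a fetch run started with -DfeedFetcher.persistJournal=true
            Path journal = WmsaHome.getDataPath().resolve("feed-journal");

            FeedJournal.replay(journal, (url, contents) -> {
                // Re-run the new parser over the recorded feed bodies
                var items = SimpleFeedParser.parse(contents);
                System.out.println(url + ": " + items.size() + " items");
            });
        }
    }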

View File: SimpleFeedParser.java (new file)

@@ -0,0 +1,94 @@
package nu.marginalia.rss.svc;
import com.apptasticsoftware.rssreader.DateTimeParser;
import com.apptasticsoftware.rssreader.util.Default;
import org.jsoup.Jsoup;
import org.jsoup.parser.Parser;
import java.time.ZonedDateTime;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
public class SimpleFeedParser {
private static final DateTimeParser dateTimeParser = Default.getDateTimeParser();
public record ItemData (
String title,
String description,
String url,
String pubDate
) {
public boolean isWellFormed() {
return title != null && !title.isBlank() &&
description != null && !description.isBlank() &&
url != null && !url.isBlank() &&
pubDate != null && !pubDate.isBlank();
}
public Optional<ZonedDateTime> getPubDateZonedDateTime() {
try {
return Optional.ofNullable(dateTimeParser.parse(pubDate()));
}
catch (Exception e) {
return Optional.empty();
}
}
}
public static List<ItemData> parse(String content) {
var doc = Jsoup.parse(content, Parser.xmlParser());
List<ItemData> ret = new ArrayList<>();
doc.select("item, entry").forEach(element -> {
String link = "";
String title = "";
String description = "";
String pubDate = "";
for (String attr : List.of("title", "dc:title")) {
if (!title.isBlank())
break;
var tag = element.getElementsByTag(attr).first();
if (tag != null) {
title = tag.text();
}
}
for (String attr : List.of("title", "summary", "content", "description", "dc:description")) {
if (!description.isBlank())
break;
var tag = element.getElementsByTag(attr).first();
if (tag != null) {
description = tag.text();
}
}
for (String attr : List.of("pubDate", "published", "updated", "issued", "created", "dc:date")) {
if (!pubDate.isBlank())
break;
var tag = element.getElementsByTag(attr).first();
if (tag != null) {
pubDate = tag.text();
}
}
for (String attr : List.of("link", "url")) {
if (!link.isBlank())
break;
var tag = element.getElementsByTag(attr).first();
if (tag != null) {
link = tag.text();
}
}
ret.add(new ItemData(title, description, link, pubDate));
});
return ret;
}
}
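Because jsoup's XML parser never throws on malformed markup, this parser tolerates the input that broke the old rssreader pipeline, stray ampersands and HTML entities included, which is why the sanitizeEntities step above could be deleted. A quick sketch; the feed string is made up:

    import nu.marginalia.rss.svc.SimpleFeedParser;

    class ParseDemo {
        public static void main(String[] args) {
            // Unescaped ampersand and an HTML entity: invalid XML, but jsoup copes
            String feed = """
                    <rss><channel>
                      <item>
                        <title>Bed & Breakfast &mdash; a review</title>
                        <link>https://example.com/post#ref</link>
                        <description>A short summary</description>
                        <pubDate>1 Apr 2021 00:00:00 +0000</pubDate>
                      </item>
                    </channel></rss>
                    """;

            for (var item : SimpleFeedParser.parse(feed)) {
                System.out.println(item.title() + " -> " + item.url());
            }
        }
    }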

View File: BrowserlessClientTest.java

@@ -2,16 +2,21 @@ package nu.marginalia.livecapture;
 
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Tag;
 import org.junit.jupiter.api.Test;
 import org.testcontainers.containers.GenericContainer;
 import org.testcontainers.junit.jupiter.Testcontainers;
 import org.testcontainers.utility.DockerImageName;
 
 import java.net.URI;
+import java.util.Map;
 
 @Testcontainers
+@Tag("slow")
 public class BrowserlessClientTest {
-    static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome")).withExposedPorts(3000);
+    static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
+            .withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
+            .withExposedPorts(3000);
 
     @BeforeAll
     public static void setup() {
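Tagging the class @Tag("slow") lets the build exclude this container-backed test from the default suite, e.g. with JUnit 5's standard Gradle filter useJUnitPlatform { excludeTags 'slow' } (assuming the build runs tests on the JUnit platform, as the junit-jupiter imports suggest).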

View File: TestXmlSanitization.java (deleted)

@@ -1,50 +0,0 @@
package nu.marginalia.rss.svc;
import com.apptasticsoftware.rssreader.Item;
import com.apptasticsoftware.rssreader.RssReader;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.util.List;
import java.util.Optional;
public class TestXmlSanitization {
@Test
public void testPreservedEntities() {
Assertions.assertEquals("&amp;", FeedFetcherService.sanitizeEntities("&amp;"));
Assertions.assertEquals("&lt;", FeedFetcherService.sanitizeEntities("&lt;"));
Assertions.assertEquals("&gt;", FeedFetcherService.sanitizeEntities("&gt;"));
Assertions.assertEquals("&apos;", FeedFetcherService.sanitizeEntities("&apos;"));
}
@Test
public void testNlnetTitleTag() {
// The NLnet atom feed puts HTML tags in the entry/title tags, which breaks the vanilla RssReader code
// Verify we're able to consume and strip out the HTML tags
RssReader r = new RssReader();
List<Item> items = r.read(ClassLoader.getSystemResourceAsStream("nlnet.atom")).toList();
Assertions.assertEquals(1, items.size());
for (var item : items) {
Assertions.assertEquals(Optional.of("50 Free and Open Source Projects Selected for NGI Zero grants"), item.getTitle());
}
}
@Test
public void testStrayAmpersand() {
Assertions.assertEquals("Bed &amp; Breakfast", FeedFetcherService.sanitizeEntities("Bed & Breakfast"));
}
@Test
public void testTranslatedHtmlEntity() {
Assertions.assertEquals("Foo -- Bar", FeedFetcherService.sanitizeEntities("Foo &mdash; Bar"));
}
@Test
public void testTranslatedHtmlEntityQuot() {
Assertions.assertEquals("\"Bob\"", FeedFetcherService.sanitizeEntities("&quot;Bob&quot;"));
}
}

View File: SearchSiteInfoService.java

@@ -28,6 +28,7 @@ import org.slf4j.LoggerFactory;
 import java.sql.SQLException;
 import java.util.*;
 import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 import java.util.function.Supplier;
@@ -67,8 +68,12 @@ public class SearchSiteInfoService {
         this.screenshotService = screenshotService;
         this.dataSource = dataSource;
         this.searchSiteSubscriptions = searchSiteSubscriptions;
+
+        Thread.ofPlatform().name("Recently Added Domains Model Updater").start(this::modelUpdater);
     }
 
+    private volatile SiteOverviewModel cachedOverviewModel = new SiteOverviewModel(List.of());
+
     @GET
     @Path("/site")
     public ModelAndView<?> handleOverview(@QueryParam String domain) {
@@ -77,23 +82,48 @@ public class SearchSiteInfoService {
             return new MapModelAndView("redirect.jte", Map.of("url", "/site/"+domain));
         }
 
-        List<SiteOverviewModel.DiscoveredDomain> domains = new ArrayList<>();
-
-        try (var conn = dataSource.getConnection();
-             var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, DISCOVER_DATE FROM EC_DOMAIN WHERE NODE_AFFINITY = 0 ORDER BY ID DESC LIMIT 10")) {
-            var rs = stmt.executeQuery();
-            while (rs.next()) {
-                domains.add(new SiteOverviewModel.DiscoveredDomain(rs.getString("DOMAIN_NAME"), rs.getString("DISCOVER_DATE")));
-            }
-        }
-        catch (SQLException ex) {
-            throw new RuntimeException();
-        }
-
         return new MapModelAndView("siteinfo/start.jte",
                 Map.of("navbar", NavbarModel.SITEINFO,
-                        "model", new SiteOverviewModel(domains)));
+                        "model", cachedOverviewModel));
     }
 
+    private void modelUpdater() {
+        while (!Thread.interrupted()) {
+            List<SiteOverviewModel.DiscoveredDomain> domains = new ArrayList<>();
+
+            // This query can be quite expensive, so we can't run it on demand
+            // for every request.  Instead, we run it every 15 minutes and cache
+            // the result.
+            try (var conn = dataSource.getConnection();
+                 var stmt = conn.prepareStatement("""
+                         SELECT DOMAIN_NAME, DISCOVER_DATE
+                         FROM EC_DOMAIN
+                         WHERE NODE_AFFINITY = 0
+                         ORDER BY ID DESC
+                         LIMIT 10
+                         """))
+            {
+                var rs = stmt.executeQuery();
+                while (rs.next()) {
+                    domains.add(new SiteOverviewModel.DiscoveredDomain(
+                            rs.getString("DOMAIN_NAME"),
+                            rs.getString("DISCOVER_DATE"))
+                    );
+                }
+            } catch (SQLException ex) {
+                logger.warn("Failed to get recently added domains: {}", ex.getMessage());
+            }
+
+            cachedOverviewModel = new SiteOverviewModel(domains);
+
+            try {
+                TimeUnit.MINUTES.sleep(15);
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+                break;
+            }
+        }
+    }
+
     public record SiteOverviewModel(List<DiscoveredDomain> domains) {
@@ -193,7 +223,7 @@ public class SearchSiteInfoService {
         );
     }
 
-    private SiteInfoWithContext listInfo(Context context, String domainName) {
+    private SiteInfoWithContext listInfo(Context context, String domainName) throws ExecutionException {
         var domain = new EdgeDomain(domainName);
 
         final int domainId = domainQueries.tryGetDomainId(domain).orElse(-1);

View File: search front page template

@@ -34,12 +34,12 @@
 <div class="max-w-7xl mx-auto flex flex-col space-y-4 fill-w">
     <div class="border dark:border-gray-600 dark:bg-gray-800 bg-white rounded p-2 m-4 ">
         <div class="text-slate-700 dark:text-white text-sm p-4">
-            <div class="fas fa-wrench mr-1 text-margeblue dark:text-slate-200"></div>
-            This is the new design and home of Marginalia Search. Migration to the new domain <pre class="inline text-red-800 dark:text-red-100">marginalia-search.com</pre> is currently <em>in progress</em>,
-            so mind that some things may be a bit broken for a day or two. <a href="https://about.marginalia-search.com/article/redesign/" class="underline text-liteblue dark:text-blue-200">Read more</a>.
+            <div class="fas fa-gift mr-1 text-margeblue dark:text-slate-200"></div>
+            This is the new design and home of Marginalia Search.
+            You can read about what this entails <a href="https://about.marginalia-search.com/article/redesign/" class="underline text-liteblue dark:text-blue-200">here</a>.
             <p class="my-4"></p>
             If you have any issues or feedback regarding this change, please email
             <a href="mailto:contact@marginalia-search.com" class="underline text-liteblue dark:text-blue-200">contact@marginalia-search.com</a>.
             The old version of Marginalia Search remains available at
             <a href="https://old-search.marginalia.nu/" class="underline text-liteblue dark:text-blue-200">https://old-search.marginalia.nu/</a>.
         </div>
     </div>
     <div class="mx-auto flex flex-col sm:flex-row my-4 sm:space-x-2 space-y-2 sm:space-y-0 w-full md:w-auto px-2">

View File: site info template (JTE)

@@ -1,5 +1,4 @@
 @import nu.marginalia.db.DbDomainQueries
-@import nu.marginalia.model.EdgeDomain
 @import nu.marginalia.search.svc.SearchSiteInfoService
 @import nu.marginalia.search.svc.SearchSiteInfoService.*
 @import nu.marginalia.search.model.UrlDetails
@@ -81,35 +80,6 @@
 @endif
 
-@if (!siteInfo.siblingDomains().isEmpty())
-    <div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
-        <i class="fas fa-globe"></i>
-        <span>Related Subdomains</span>
-    </div>
-
-    <table class="min-w-full divide-y divide-gray-200 dark:divide-gray-600 mx-4">
-        <thead>
-        <tr class="bg-gray-50 dark:bg-gray-700">
-            <th scope="col" class="px-2 py-2 text-left text-xs font-medium text-gray-500 dark:text-gray-100 uppercase tracking-wider">Domain Name</th>
-        </tr>
-        </thead>
-        <tbody class="bg-white dark:bg-gray-800 divide-y divide-gray-200 dark:divide-gray-600 text-xs">
-        @for (DbDomainQueries.DomainWithNode sibling : siteInfo.siblingDomains())
-            <tr>
-                <td class="px-3 py-6 md:py-3 whitespace-nowrap">
-                    <a class="text-liteblue dark:text-blue-200" href="/site/${sibling.domain().toString()}">${sibling.domain().toString()}</a>
-                    @if (!sibling.isIndexed())
-                        <i class="ml-1 fa-regular fa-question-circle text-gray-400 dark:text-gray-600 text-xs" title="Not indexed"></i>
-                    @endif
-                </td>
-            </tr>
-        @endfor
-        </tbody>
-    </table>
-@endif
-
 @if (siteInfo.domainInformation().isUnknownDomain())
     <div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
         <i class="fa-regular fa-circle-question"></i>
@@ -178,6 +148,36 @@
     </form>
 @endif
 
+@if (!siteInfo.siblingDomains().isEmpty())
+    <div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
+        <i class="fas fa-globe"></i>
+        <span>Related Subdomains</span>
+    </div>
+
+    <table class="min-w-full divide-y divide-gray-200 dark:divide-gray-600 mx-4">
+        <thead>
+        <tr class="bg-gray-50 dark:bg-gray-700">
+            <th scope="col" class="px-2 py-2 text-left text-xs font-medium text-gray-500 dark:text-gray-100 uppercase tracking-wider">Domain Name</th>
+        </tr>
+        </thead>
+        <tbody class="bg-white dark:bg-gray-800 divide-y divide-gray-200 dark:divide-gray-600 text-xs">
+        @for (DbDomainQueries.DomainWithNode sibling : siteInfo.siblingDomains())
+            <tr>
+                <td class="px-3 py-6 md:py-3 whitespace-nowrap">
+                    <a class="text-liteblue dark:text-blue-200" href="/site/${sibling.domain().toString()}">${sibling.domain().toString()}</a>
+                    @if (!sibling.isIndexed())
+                        <i class="ml-1 fa-regular fa-question-circle text-gray-400 dark:text-gray-600 text-xs" title="Not indexed"></i>
+                    @endif
+                </td>
+            </tr>
+        @endfor
+        </tbody>
+    </table>
+@endif
+
 @if (siteInfo.isKnown())
     <div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
         <i class="fas fa-chart-simple"></i>

View File: ScreenshotCaptureToolMain.java

@@ -28,7 +28,7 @@ import java.util.concurrent.TimeUnit;
 public class ScreenshotCaptureToolMain {
 
     private static final Logger logger = LoggerFactory.getLogger(ScreenshotCaptureToolMain.class);
+    private static final String BROWSERLESS_TOKEN = System.getenv("live-capture.browserless-token");
 
     public static void main(String[] args) {
         DatabaseModule databaseModule = new DatabaseModule(false);
         var ds = databaseModule.provideConnection();
@@ -107,7 +107,7 @@ public class ScreenshotCaptureToolMain {
         );
 
         var request = HttpRequest.newBuilder()
-                .uri(new URI("http://browserless:3000/screenshot"))
+                .uri(new URI("http://browserless:3000/screenshot?token=" + BROWSERLESS_TOKEN))
                 .method("POST", HttpRequest.BodyPublishers.ofString(
                         gson.toJson(requestData)
                 ))