mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits


3 Commits

Author          SHA1        Message                                                                                     Date
Viktor Lofgren  e4a41f7dd1  (crawler) Correct content type probing to only run on URLs that are suspected to be binary  2024-12-26 14:13:17 +01:00
Viktor          69ad6287b1  Update ROADMAP.md                                                                           2024-12-25 21:16:38 +00:00
Viktor Lofgren  41a59dcf45  (feed) Sanitize illegal HTML entities out of the feed XML before parsing                    2024-12-25 14:53:28 +01:00
5 changed files with 66 additions and 4 deletions

ROADMAP.md

@@ -21,7 +21,7 @@ word n-grams known beforehand. This limits the ability to interpret longer quer
 The positions mask should be supplemented or replaced with a more accurate (e.g.) gamma coded positions
 list, as is the civilized way of doing this.
 
-Completed with PR https://github.com/MarginaliaSearch/MarginaliaSearch/pull/99
+Completed with PR [#99](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/99)
 
 ## Hybridize crawler w/ Common Crawl data
@@ -41,6 +41,12 @@ The search engine has a bit of a problem showing spicy content mixed in with the
 to have a way to filter this out. It's likely something like a URL blacklist (e.g. [UT1](https://dsi.ut-capitole.fr/blacklists/index_en.php) )
 combined with naive bayesian filter would go a long way, or something more sophisticated...?
 
+## Web Design Overhaul
+
+The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
+
+In progress: PR [#127](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/127) -- demo available at https://test.marginalia.nu/
+
 ## Additional Language Support
 
 It would be desirable if the search engine supported more languages than English. This is partially about
@@ -56,7 +62,7 @@ it should be extended to all domains. It would also be interesting to offer sea
 RSS data itself, or use the RSS set to feed a special live index that updates faster than the
 main dataset.
 
-Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122)
+Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122) and PR [#125](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/125)
 
 ## Support for binary formats like PDF
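The filtering item in the diff above only gestures at a design. Purely as an illustration, a URL blacklist combined with a naive Bayes text classifier could be wired together along the following lines. Every name below is hypothetical; nothing here exists in the Marginalia codebase.

import java.util.*;

/** Hypothetical sketch of the roadmap idea above: a hard URL blacklist
 *  combined with a naive Bayes classifier over page text. */
public class SpicyContentFilter {
    private final Set<String> blacklistedDomains;
    private final Map<String, Integer> spicyCounts = new HashMap<>();
    private final Map<String, Integer> cleanCounts = new HashMap<>();
    private final Set<String> vocabulary = new HashSet<>();
    private long spicyTokens = 0, cleanTokens = 0;
    private int spicyDocs = 0, cleanDocs = 0;

    public SpicyContentFilter(Set<String> blacklistedDomains) {
        this.blacklistedDomains = blacklistedDomains;
    }

    public void train(String text, boolean spicy) {
        for (String token : text.toLowerCase().split("\\W+")) {
            if (token.isEmpty()) continue;
            vocabulary.add(token);
            if (spicy) { spicyCounts.merge(token, 1, Integer::sum); spicyTokens++; }
            else       { cleanCounts.merge(token, 1, Integer::sum); cleanTokens++; }
        }
        if (spicy) spicyDocs++; else cleanDocs++;
    }

    /** The blacklist decides outright; otherwise compare Laplace-smoothed
     *  log-posteriors for the two classes. */
    public boolean shouldFilter(String domain, String text) {
        if (blacklistedDomains.contains(domain))
            return true;

        double logSpicy = Math.log((spicyDocs + 1.0) / (spicyDocs + cleanDocs + 2.0));
        double logClean = Math.log((cleanDocs + 1.0) / (spicyDocs + cleanDocs + 2.0));
        int v = vocabulary.size();
        for (String token : text.toLowerCase().split("\\W+")) {
            if (token.isEmpty()) continue;
            logSpicy += Math.log((spicyCounts.getOrDefault(token, 0) + 1.0) / (spicyTokens + v + 1.0));
            logClean += Math.log((cleanCounts.getOrDefault(token, 0) + 1.0) / (cleanTokens + v + 1.0));
        }
        return logSpicy > logClean;
    }
}

The blacklist catches known-bad domains cheaply while the classifier generalizes to unlisted ones; a list like UT1 could plausibly seed both the blacklist and the training data.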

FeedFetcherService.java

@@ -316,6 +316,8 @@ public class FeedFetcherService {
     public FeedItems parseFeed(String feedData, FeedDefinition definition) {
         try {
+            feedData = sanitizeEntities(feedData);
+
             List<Item> rawItems = rssReader.read(
                     // Massage the data to maximize the possibility of the flaky XML parser consuming it
                     new BOMInputStream(new ByteArrayInputStream(feedData.trim().getBytes(StandardCharsets.UTF_8)), false)
@@ -342,6 +344,32 @@ public class FeedFetcherService {
         }
     }
 
+    private static final Map<String, String> HTML_ENTITIES = Map.of(
+            "&raquo;", "»",
+            "&laquo;", "«",
+            "&mdash;", "--",
+            "&ndash;", "-",
+            "&rsquo;", "'",
+            "&lsquo;", "'",
+            "&nbsp;", ""
+    );
+
+    /** The XML parser will blow up if you insert HTML entities in the feed XML,
+     * which is unfortunately relatively common. Replace them as far as is possible
+     * with their corresponding characters
+     */
+    static String sanitizeEntities(String feedData) {
+        String result = feedData;
+        for (Map.Entry<String, String> entry : HTML_ENTITIES.entrySet()) {
+            result = result.replace(entry.getKey(), entry.getValue());
+        }
+
+        // Handle lone ampersands not part of a recognized XML entity
+        result = result.replaceAll("&(?!(amp|lt|gt|apos|quot);)", "&amp;");
+
+        return result;
+    }
+
     /** Decide whether to keep URI fragments in the feed items.
      * <p></p>
      * We keep fragments if there are multiple different fragments in the items.
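The core of the new sanitizeEntities method is the negative lookahead in its final replaceAll: a bare & is escaped to &amp; unless it already begins one of the five entities XML itself defines. A quick standalone illustration of that regex, independent of the Marginalia code:

public class LookaheadDemo {
    public static void main(String[] args) {
        String re = "&(?!(amp|lt|gt|apos|quot);)";

        // A stray ampersand is escaped...
        System.out.println("Bed & Breakfast".replaceAll(re, "&amp;"));  // Bed &amp; Breakfast

        // ...while a well-formed XML entity passes through untouched.
        System.out.println("a &lt; b".replaceAll(re, "&amp;"));         // a &lt; b

        // Numeric character references are not in the lookahead, so they
        // get escaped too: &#8212; becomes &amp;#8212;
        System.out.println("&#8212;".replaceAll(re, "&amp;"));
    }
}

Note that the mdash/ndash mappings in HTML_ENTITIES are lossy ASCII approximations; the apparent goal is XML the parser will accept, not typographic fidelity.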

FeedFetcherServiceTest.java

@@ -99,7 +99,9 @@ class FeedFetcherServiceTest extends AbstractModule {
         feedFetcherService.setDeterministic();
         feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
 
-        Assertions.assertFalse(feedDb.getFeed(new EdgeDomain("www.marginalia.nu")).isEmpty());
+        var result = feedDb.getFeed(new EdgeDomain("www.marginalia.nu"));
+        System.out.println(result);
+        Assertions.assertFalse(result.isEmpty());
     }
 
     @Tag("flaky")

TestXmlSanitization.java

@@ -0,0 +1,26 @@
+package nu.marginalia.rss.svc;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+public class TestXmlSanitization {
+
+    @Test
+    public void testPreservedEntities() {
+        Assertions.assertEquals("&amp;", FeedFetcherService.sanitizeEntities("&amp;"));
+        Assertions.assertEquals("&lt;", FeedFetcherService.sanitizeEntities("&lt;"));
+        Assertions.assertEquals("&gt;", FeedFetcherService.sanitizeEntities("&gt;"));
+        Assertions.assertEquals("&quot;", FeedFetcherService.sanitizeEntities("&quot;"));
+        Assertions.assertEquals("&apos;", FeedFetcherService.sanitizeEntities("&apos;"));
+    }
+
+    @Test
+    public void testStrayAmpersand() {
+        Assertions.assertEquals("Bed &amp; Breakfast", FeedFetcherService.sanitizeEntities("Bed & Breakfast"));
+    }
+
+    @Test
+    public void testTranslatedHtmlEntity() {
+        Assertions.assertEquals("Foo -- Bar", FeedFetcherService.sanitizeEntities("Foo &mdash; Bar"));
+    }
+}

HttpFetcherImpl.java

@@ -139,7 +139,7 @@ public class HttpFetcherImpl implements HttpFetcher {
     public ContentTypeProbeResult probeContentType(EdgeUrl url,
                                                    WarcRecorder warcRecorder,
                                                    ContentTags tags) throws RateLimitException {
-        if (tags.isEmpty()) {
+        if (tags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) {
             var headBuilder = new Request.Builder().head()
                     .addHeader("User-agent", userAgentString)
                     .addHeader("Accept-Encoding", "gzip")