mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits


5 Commits

Author | SHA1 | Message | Date
Viktor Lofgren | 4bb71b8439 | (crawler) Correct content type probing to only run on URLs that are suspected to be binary | 2024-12-26 14:26:23 +01:00
Viktor Lofgren | e4a41f7dd1 | (crawler) Correct content type probing to only run on URLs that are suspected to be binary | 2024-12-26 14:13:17 +01:00
Viktor | 69ad6287b1 | Update ROADMAP.md | 2024-12-25 21:16:38 +00:00
Viktor Lofgren | 41a59dcf45 | (feed) Sanitize illegal HTML entities out of the feed XML before parsing | 2024-12-25 14:53:28 +01:00
Viktor Lofgren | 94d4d2edb7 | (live-crawler) Add refresh date to feeds API | 2024-12-25 14:20:48 +01:00
    For now this is just the ctime for the feeds db. We may want to store this per-record in the future.
10 changed files with 97 additions and 22 deletions

View File: ROADMAP.md

@@ -21,7 +21,7 @@ word n-grams known beforehand. This limits the ability to interpret longer queries.
 The positions mask should be supplemented or replaced with a more accurate (e.g.) gamma coded positions
 list, as is the civilized way of doing this.
 
-Completed with PR https://github.com/MarginaliaSearch/MarginaliaSearch/pull/99
+Completed with PR [#99](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/99)
 
 ## Hybridize crawler w/ Common Crawl data
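A gamma coded positions list stores the gaps between successive term positions as Elias gamma codes, so the small gaps that dominate real posting lists cost only a few bits each. The sketch below illustrates the coding scheme only; the class and method names are hypothetical and this is not Marginalia's implementation. It assumes strictly increasing, 1-based positions.

    import java.util.ArrayList;
    import java.util.List;

    class GammaPositionsCodec {
        // Gamma code for a value v >= 1: (bitlength(v) - 1) zero bits,
        // followed by the bits of v itself (which start with a 1).
        static List<Boolean> encode(int[] positions) {
            List<Boolean> bits = new ArrayList<>();
            int prev = 0;
            for (int pos : positions) {
                int gap = pos - prev;  // gap coding keeps the values small
                prev = pos;
                int n = 32 - Integer.numberOfLeadingZeros(gap);
                for (int i = 0; i < n - 1; i++) bits.add(false);       // unary length prefix
                for (int i = n - 1; i >= 0; i--) bits.add((gap >> i & 1) == 1);
            }
            return bits;
        }

        static int[] decode(List<Boolean> bits, int count) {
            int[] positions = new int[count];
            int idx = 0, prev = 0;
            for (int k = 0; k < count; k++) {
                int zeros = 0;
                while (!bits.get(idx++)) zeros++;  // consumes the prefix and the leading 1-bit
                int gap = 1;
                for (int i = 0; i < zeros; i++) gap = gap << 1 | (bits.get(idx++) ? 1 : 0);
                prev += gap;
                positions[k] = prev;
            }
            return positions;
        }
    }

For example, positions [3, 5, 9] become gaps [3, 2, 4], coded as 011 010 00100 -- eleven bits in total.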
@@ -41,6 +41,12 @@ The search engine has a bit of a problem showing spicy content mixed in with the
 to have a way to filter this out. It's likely something like a URL blacklist (e.g. [UT1](https://dsi.ut-capitole.fr/blacklists/index_en.php) )
 combined with naive bayesian filter would go a long way, or something more sophisticated...?
 
+## Web Design Overhaul
+
+The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
+
+In progress: PR [#127](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/127) -- demo available at https://test.marginalia.nu/
+
 ## Additional Language Support
 
 It would be desirable if the search engine supported more languages than English. This is partially about
@@ -56,7 +62,7 @@ it should be extended to all domains. It would also be interesting to offer search
 RSS data itself, or use the RSS set to feed a special live index that updates faster than the
 main dataset.
 
-Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122)
+Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122) and PR [#125](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/125)
 
 ## Support for binary formats like PDF
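The "naive bayesian filter" mentioned in the NSFW-filtering item would look roughly like the sketch below: token-level features with add-one smoothing, scored as log-odds. All names are hypothetical and the vocabulary estimate is deliberately crude; this illustrates the idea, it is not repository code.

    import java.util.HashMap;
    import java.util.Map;

    class NaiveBayesNsfwFilter {
        private final Map<String, Integer> nsfwCounts = new HashMap<>();
        private final Map<String, Integer> safeCounts = new HashMap<>();
        private int nsfwTotal = 0, safeTotal = 0;

        void train(String[] tokens, boolean nsfw) {
            Map<String, Integer> counts = nsfw ? nsfwCounts : safeCounts;
            for (String t : tokens) counts.merge(t, 1, Integer::sum);
            if (nsfw) nsfwTotal += tokens.length; else safeTotal += tokens.length;
        }

        // Log-odds of NSFW vs. safe under the naive independence assumption;
        // a score > 0 means the document looks more NSFW than safe.
        double logOdds(String[] tokens) {
            int vocab = nsfwCounts.size() + safeCounts.size() + 1; // rough vocabulary size
            double score = 0;
            for (String t : tokens) {
                double pNsfw = (nsfwCounts.getOrDefault(t, 0) + 1.0) / (nsfwTotal + vocab);
                double pSafe = (safeCounts.getOrDefault(t, 0) + 1.0) / (safeTotal + vocab);
                score += Math.log(pNsfw / pSafe);
            }
            return score;
        }
    }

In practice this would sit behind the URL blacklist, catching sites the list misses.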

View File: FeedsClient.java

@@ -59,12 +59,6 @@ public class FeedsClient {
                 .forEachRemaining(rsp -> consumer.accept(rsp.getDomain(), new ArrayList<>(rsp.getUrlList())));
     }
 
-    public record UpdatedDomain(String domain, List<String> urls) {
-        public UpdatedDomain(RpcUpdatedLinksResponse rsp) {
-            this(rsp.getDomain(), new ArrayList<>(rsp.getUrlList()));
-        }
-    }
-
     /** Get the hash of the feed data, for identifying when the data has been updated */
     public String getFeedDataHash() {
         return channelPool.call(FeedApiGrpc.FeedApiBlockingStub::getFeedDataHash)

View File: feed API .proto

@@ -46,6 +46,7 @@ message RpcFeed {
     string feedUrl = 3;
     string updated = 4;
     repeated RpcFeedItem items = 5;
+    int64 fetchTimestamp = 6;
 }
 
 message RpcFeedItem {
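The new fetchTimestamp field carries epoch milliseconds; FeedsGrpcService below populates it from FeedDb.getFetchTime(). A consumer can convert it back to an Instant, as in this sketch, which relies only on the getFetchTimestamp() accessor protoc generates for an int64 field; the wrapper class is hypothetical.

    import java.time.Instant;

    class FeedTimestamps {
        // RpcFeed is the protoc-generated message type from the definition above
        static Instant fetchTime(RpcFeed feed) {
            return Instant.ofEpochMilli(feed.getFetchTimestamp());
        }
    }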

View File: FeedDb.java

@@ -12,9 +12,11 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.BufferedInputStream;
+import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
+import java.nio.file.attribute.PosixFileAttributes;
 import java.security.MessageDigest;
 import java.time.Instant;
 import java.util.Base64;
@@ -209,4 +211,20 @@ public class FeedDb {
         reader.getLinksUpdatedSince(since, consumer);
     }
 
+    public Instant getFetchTime() {
+        if (!Files.exists(readerDbPath)) {
+            return Instant.ofEpochMilli(0);
+        }
+
+        try {
+            return Files.readAttributes(readerDbPath, PosixFileAttributes.class)
+                    .creationTime()
+                    .toInstant();
+        }
+        catch (IOException ex) {
+            logger.error("Failed to read the creation time of {}", readerDbPath);
+            return Instant.ofEpochMilli(0);
+        }
+    }
 }
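Note that creationTime() is best-effort: per the BasicFileAttributes javadoc, file systems that do not track a birth time return an implementation-specific default, typically the last-modified time. For the coarse "refresh date" described in the commit message, either value serves.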

View File: FeedFetcherService.java

@@ -316,6 +316,8 @@ public class FeedFetcherService {
     public FeedItems parseFeed(String feedData, FeedDefinition definition) {
         try {
+            feedData = sanitizeEntities(feedData);
+
             List<Item> rawItems = rssReader.read(
                     // Massage the data to maximize the possibility of the flaky XML parser consuming it
                     new BOMInputStream(new ByteArrayInputStream(feedData.trim().getBytes(StandardCharsets.UTF_8)), false)
@@ -342,6 +344,32 @@ public class FeedFetcherService {
         }
     }
 
+    private static final Map<String, String> HTML_ENTITIES = Map.of(
+            "&raquo;", "»",
+            "&laquo;", "«",
+            "&mdash;", "--",
+            "&ndash;", "-",
+            "&rsquo;", "'",
+            "&lsquo;", "'",
+            "&nbsp;", ""
+    );
+
+    /** The XML parser will blow up if you insert HTML entities in the feed XML,
+     *  which is unfortunately relatively common. Replace them as far as is possible
+     *  with their corresponding characters.
+     */
+    static String sanitizeEntities(String feedData) {
+        String result = feedData;
+        for (Map.Entry<String, String> entry : HTML_ENTITIES.entrySet()) {
+            result = result.replace(entry.getKey(), entry.getValue());
+        }
+
+        // Handle lone ampersands not part of a recognized XML entity
+        result = result.replaceAll("&(?!(amp|lt|gt|apos|quot);)", "&amp;");
+
+        return result;
+    }
+
     /** Decide whether to keep URI fragments in the feed items.
      * <p></p>
      * We keep fragments if there are multiple different fragments in the items.

View File: FeedsGrpcService.java

@@ -107,8 +107,7 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis
     @Override
     public void getFeed(RpcDomainId request,
-                        StreamObserver<RpcFeed> responseObserver)
-    {
+                        StreamObserver<RpcFeed> responseObserver) {
         if (!feedDb.isEnabled()) {
             responseObserver.onError(new IllegalStateException("Feed database is disabled on this node"));
             return;
@@ -126,7 +125,8 @@
                 .setDomainId(request.getDomainId())
                 .setDomain(domainName.get().toString())
                 .setFeedUrl(feedItems.feedUrl())
-                .setUpdated(feedItems.updated());
+                .setUpdated(feedItems.updated())
+                .setFetchTimestamp(feedDb.getFetchTime().toEpochMilli());
 
         for (var item : feedItems.items()) {
             retB.addItemsBuilder()

View File: FeedFetcherServiceTest.java

@@ -99,7 +99,9 @@ class FeedFetcherServiceTest extends AbstractModule {
         feedFetcherService.setDeterministic();
         feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
 
-        Assertions.assertFalse(feedDb.getFeed(new EdgeDomain("www.marginalia.nu")).isEmpty());
+        var result = feedDb.getFeed(new EdgeDomain("www.marginalia.nu"));
+        System.out.println(result);
+        Assertions.assertFalse(result.isEmpty());
     }
 
     @Tag("flaky")

View File: TestXmlSanitization.java (new file)

@@ -0,0 +1,26 @@
+package nu.marginalia.rss.svc;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+public class TestXmlSanitization {
+
+    @Test
+    public void testPreservedEntities() {
+        Assertions.assertEquals("&amp;", FeedFetcherService.sanitizeEntities("&amp;"));
+        Assertions.assertEquals("&lt;", FeedFetcherService.sanitizeEntities("&lt;"));
+        Assertions.assertEquals("&gt;", FeedFetcherService.sanitizeEntities("&gt;"));
+        Assertions.assertEquals("&quot;", FeedFetcherService.sanitizeEntities("&quot;"));
+        Assertions.assertEquals("&apos;", FeedFetcherService.sanitizeEntities("&apos;"));
+    }
+
+    @Test
+    public void testStrayAmpersand() {
+        Assertions.assertEquals("Bed &amp; Breakfast", FeedFetcherService.sanitizeEntities("Bed & Breakfast"));
+    }
+
+    @Test
+    public void testTranslatedHtmlEntity() {
+        Assertions.assertEquals("Foo -- Bar", FeedFetcherService.sanitizeEntities("Foo &mdash; Bar"));
+    }
+}

View File: HttpFetcherImpl.java

@@ -139,7 +139,7 @@ public class HttpFetcherImpl implements HttpFetcher {
     public ContentTypeProbeResult probeContentType(EdgeUrl url,
                                                    WarcRecorder warcRecorder,
                                                    ContentTags tags) throws RateLimitException {
-        if (tags.isEmpty()) {
+        if (tags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) {
             var headBuilder = new Request.Builder().head()
                     .addHeader("User-agent", userAgentString)
                     .addHeader("Accept-Encoding", "gzip")

View File: ContentTypeProberTest.java

@@ -42,24 +42,24 @@ class ContentTypeProberTest {
         port = r.nextInt(10000) + 8000;
         server = HttpServer.create(new InetSocketAddress("127.0.0.1", port), 10);
 
-        server.createContext("/html", exchange -> {
+        server.createContext("/html.gz", exchange -> {
             exchange.getResponseHeaders().add("Content-Type", "text/html");
             exchange.sendResponseHeaders(200, -1);
             exchange.close();
         });
-        server.createContext("/redir", exchange -> {
-            exchange.getResponseHeaders().add("Location", "/html");
+        server.createContext("/redir.gz", exchange -> {
+            exchange.getResponseHeaders().add("Location", "/html.gz");
             exchange.sendResponseHeaders(301, -1);
             exchange.close();
         });
-        server.createContext("/bin", exchange -> {
+        server.createContext("/bin.gz", exchange -> {
             exchange.getResponseHeaders().add("Content-Type", "application/binary");
             exchange.sendResponseHeaders(200, -1);
             exchange.close();
         });
-        server.createContext("/timeout", exchange -> {
+        server.createContext("/timeout.gz", exchange -> {
             try {
                 Thread.sleep(15_000);
             } catch (InterruptedException e) {
@@ -73,10 +73,10 @@ class ContentTypeProberTest {
         server.start();
 
-        htmlEndpoint = EdgeUrl.parse("http://localhost:" + port + "/html").get();
-        binaryEndpoint = EdgeUrl.parse("http://localhost:" + port + "/bin").get();
-        timeoutEndpoint = EdgeUrl.parse("http://localhost:" + port + "/timeout").get();
-        htmlRedirEndpoint = EdgeUrl.parse("http://localhost:" + port + "/redir").get();
+        htmlEndpoint = EdgeUrl.parse("http://localhost:" + port + "/html.gz").get();
+        binaryEndpoint = EdgeUrl.parse("http://localhost:" + port + "/bin.gz").get();
+        timeoutEndpoint = EdgeUrl.parse("http://localhost:" + port + "/timeout.gz").get();
+        htmlRedirEndpoint = EdgeUrl.parse("http://localhost:" + port + "/redir.gz").get();
 
         fetcher = new HttpFetcherImpl("test");
         recorder = new WarcRecorder(warcFile);