mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
Compare commits
2 Commits
deploy-000
...
deploy-000
Author | SHA1 | Date | |
---|---|---|---|
|
41a59dcf45 | ||
|
94d4d2edb7 |
@@ -59,12 +59,6 @@ public class FeedsClient {
|
||||
.forEachRemaining(rsp -> consumer.accept(rsp.getDomain(), new ArrayList<>(rsp.getUrlList())));
|
||||
}
|
||||
|
||||
public record UpdatedDomain(String domain, List<String> urls) {
|
||||
public UpdatedDomain(RpcUpdatedLinksResponse rsp) {
|
||||
this(rsp.getDomain(), new ArrayList<>(rsp.getUrlList()));
|
||||
}
|
||||
}
|
||||
|
||||
/** Get the hash of the feed data, for identifying when the data has been updated */
|
||||
public String getFeedDataHash() {
|
||||
return channelPool.call(FeedApiGrpc.FeedApiBlockingStub::getFeedDataHash)
|
||||
|
@@ -46,6 +46,7 @@ message RpcFeed {
|
||||
string feedUrl = 3;
|
||||
string updated = 4;
|
||||
repeated RpcFeedItem items = 5;
|
||||
int64 fetchTimestamp = 6;
|
||||
}
|
||||
|
||||
message RpcFeedItem {
|
||||
|
@@ -12,9 +12,11 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.nio.file.attribute.PosixFileAttributes;
|
||||
import java.security.MessageDigest;
|
||||
import java.time.Instant;
|
||||
import java.util.Base64;
|
||||
@@ -209,4 +211,20 @@ public class FeedDb {
|
||||
|
||||
reader.getLinksUpdatedSince(since, consumer);
|
||||
}
|
||||
|
||||
public Instant getFetchTime() {
|
||||
if (!Files.exists(readerDbPath)) {
|
||||
return Instant.ofEpochMilli(0);
|
||||
}
|
||||
|
||||
try {
|
||||
return Files.readAttributes(readerDbPath, PosixFileAttributes.class)
|
||||
.creationTime()
|
||||
.toInstant();
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.error("Failed to read the creatiom time of {}", readerDbPath);
|
||||
return Instant.ofEpochMilli(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -316,6 +316,8 @@ public class FeedFetcherService {
|
||||
|
||||
public FeedItems parseFeed(String feedData, FeedDefinition definition) {
|
||||
try {
|
||||
feedData = sanitizeEntities(feedData);
|
||||
|
||||
List<Item> rawItems = rssReader.read(
|
||||
// Massage the data to maximize the possibility of the flaky XML parser consuming it
|
||||
new BOMInputStream(new ByteArrayInputStream(feedData.trim().getBytes(StandardCharsets.UTF_8)), false)
|
||||
@@ -342,6 +344,32 @@ public class FeedFetcherService {
|
||||
}
|
||||
}
|
||||
|
||||
private static final Map<String, String> HTML_ENTITIES = Map.of(
|
||||
"»", "»",
|
||||
"«", "«",
|
||||
"—", "--",
|
||||
"–", "-",
|
||||
"’", "'",
|
||||
"‘", "'",
|
||||
" ", ""
|
||||
);
|
||||
|
||||
/** The XML parser will blow up if you insert HTML entities in the feed XML,
|
||||
* which is unfortunately relatively common. Replace them as far as is possible
|
||||
* with their corresponding characters
|
||||
*/
|
||||
static String sanitizeEntities(String feedData) {
|
||||
String result = feedData;
|
||||
for (Map.Entry<String, String> entry : HTML_ENTITIES.entrySet()) {
|
||||
result = result.replace(entry.getKey(), entry.getValue());
|
||||
}
|
||||
|
||||
// Handle lone ampersands not part of a recognized XML entity
|
||||
result = result.replaceAll("&(?!(amp|lt|gt|apos|quot);)", "&");
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/** Decide whether to keep URI fragments in the feed items.
|
||||
* <p></p>
|
||||
* We keep fragments if there are multiple different fragments in the items.
|
||||
|
@@ -107,8 +107,7 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis
|
||||
|
||||
@Override
|
||||
public void getFeed(RpcDomainId request,
|
||||
StreamObserver<RpcFeed> responseObserver)
|
||||
{
|
||||
StreamObserver<RpcFeed> responseObserver) {
|
||||
if (!feedDb.isEnabled()) {
|
||||
responseObserver.onError(new IllegalStateException("Feed database is disabled on this node"));
|
||||
return;
|
||||
@@ -126,7 +125,8 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis
|
||||
.setDomainId(request.getDomainId())
|
||||
.setDomain(domainName.get().toString())
|
||||
.setFeedUrl(feedItems.feedUrl())
|
||||
.setUpdated(feedItems.updated());
|
||||
.setUpdated(feedItems.updated())
|
||||
.setFetchTimestamp(feedDb.getFetchTime().toEpochMilli());
|
||||
|
||||
for (var item : feedItems.items()) {
|
||||
retB.addItemsBuilder()
|
||||
|
@@ -99,7 +99,9 @@ class FeedFetcherServiceTest extends AbstractModule {
|
||||
feedFetcherService.setDeterministic();
|
||||
feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
|
||||
|
||||
Assertions.assertFalse(feedDb.getFeed(new EdgeDomain("www.marginalia.nu")).isEmpty());
|
||||
var result = feedDb.getFeed(new EdgeDomain("www.marginalia.nu"));
|
||||
System.out.println(result);
|
||||
Assertions.assertFalse(result.isEmpty());
|
||||
}
|
||||
|
||||
@Tag("flaky")
|
||||
|
@@ -0,0 +1,26 @@
|
||||
package nu.marginalia.rss.svc;
|
||||
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
public class TestXmlSanitization {
|
||||
|
||||
@Test
|
||||
public void testPreservedEntities() {
|
||||
Assertions.assertEquals("&", FeedFetcherService.sanitizeEntities("&"));
|
||||
Assertions.assertEquals("<", FeedFetcherService.sanitizeEntities("<"));
|
||||
Assertions.assertEquals(">", FeedFetcherService.sanitizeEntities(">"));
|
||||
Assertions.assertEquals(""", FeedFetcherService.sanitizeEntities("""));
|
||||
Assertions.assertEquals("'", FeedFetcherService.sanitizeEntities("'"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testStrayAmpersand() {
|
||||
Assertions.assertEquals("Bed & Breakfast", FeedFetcherService.sanitizeEntities("Bed & Breakfast"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTranslatedHtmlEntity() {
|
||||
Assertions.assertEquals("Foo -- Bar", FeedFetcherService.sanitizeEntities("Foo — Bar"));
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user