mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 17:32:39 +02:00
Compare commits
4 Commits
deploy-004
...
deploy-005
Author | SHA1 | Date | |
---|---|---|---|
|
b245cc9f38 | ||
|
6614d05bdf | ||
|
55aeb03c4a | ||
|
faa589962f |
@@ -89,7 +89,7 @@ public class DatabaseModule extends AbstractModule {
|
|||||||
config.addDataSourceProperty("prepStmtCacheSize", "250");
|
config.addDataSourceProperty("prepStmtCacheSize", "250");
|
||||||
config.addDataSourceProperty("prepStmtCacheSqlLimit", "2048");
|
config.addDataSourceProperty("prepStmtCacheSqlLimit", "2048");
|
||||||
|
|
||||||
config.setMaximumPoolSize(5);
|
config.setMaximumPoolSize(Integer.getInteger("db.poolSize", 5));
|
||||||
config.setMinimumIdle(2);
|
config.setMinimumIdle(2);
|
||||||
|
|
||||||
config.setMaxLifetime(Duration.ofMinutes(9).toMillis());
|
config.setMaxLifetime(Duration.ofMinutes(9).toMillis());
|
||||||
|
@@ -15,7 +15,9 @@ import java.util.Map;
|
|||||||
|
|
||||||
/** Client for local browserless.io API */
|
/** Client for local browserless.io API */
|
||||||
public class BrowserlessClient implements AutoCloseable {
|
public class BrowserlessClient implements AutoCloseable {
|
||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(BrowserlessClient.class);
|
private static final Logger logger = LoggerFactory.getLogger(BrowserlessClient.class);
|
||||||
|
private static final String BROWSERLESS_TOKEN = System.getProperty("live-capture.browserless-token", "BROWSERLESS_TOKEN");
|
||||||
|
|
||||||
private final HttpClient httpClient = HttpClient.newBuilder()
|
private final HttpClient httpClient = HttpClient.newBuilder()
|
||||||
.version(HttpClient.Version.HTTP_1_1)
|
.version(HttpClient.Version.HTTP_1_1)
|
||||||
@@ -36,7 +38,7 @@ public class BrowserlessClient implements AutoCloseable {
|
|||||||
);
|
);
|
||||||
|
|
||||||
var request = HttpRequest.newBuilder()
|
var request = HttpRequest.newBuilder()
|
||||||
.uri(browserlessURI.resolve("/content"))
|
.uri(browserlessURI.resolve("/content?token="+BROWSERLESS_TOKEN))
|
||||||
.method("POST", HttpRequest.BodyPublishers.ofString(
|
.method("POST", HttpRequest.BodyPublishers.ofString(
|
||||||
gson.toJson(requestData)
|
gson.toJson(requestData)
|
||||||
))
|
))
|
||||||
@@ -63,7 +65,7 @@ public class BrowserlessClient implements AutoCloseable {
|
|||||||
);
|
);
|
||||||
|
|
||||||
var request = HttpRequest.newBuilder()
|
var request = HttpRequest.newBuilder()
|
||||||
.uri(browserlessURI.resolve("/screenshot"))
|
.uri(browserlessURI.resolve("/screenshot?token="+BROWSERLESS_TOKEN))
|
||||||
.method("POST", HttpRequest.BodyPublishers.ofString(
|
.method("POST", HttpRequest.BodyPublishers.ofString(
|
||||||
gson.toJson(requestData)
|
gson.toJson(requestData)
|
||||||
))
|
))
|
||||||
|
@@ -1,6 +1,6 @@
|
|||||||
package nu.marginalia.rss.model;
|
package nu.marginalia.rss.model;
|
||||||
|
|
||||||
import com.apptasticsoftware.rssreader.Item;
|
import nu.marginalia.rss.svc.SimpleFeedParser;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
@@ -18,37 +18,33 @@ public record FeedItem(String title,
|
|||||||
public static final int MAX_DESC_LENGTH = 255;
|
public static final int MAX_DESC_LENGTH = 255;
|
||||||
public static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
|
public static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
|
||||||
|
|
||||||
public static FeedItem fromItem(Item item, boolean keepFragment) {
|
public static FeedItem fromItem(SimpleFeedParser.ItemData item, boolean keepFragment) {
|
||||||
String title = item.getTitle().orElse("");
|
String title = item.title();
|
||||||
String date = getItemDate(item);
|
String date = getItemDate(item);
|
||||||
String description = getItemDescription(item);
|
String description = getItemDescription(item);
|
||||||
String url;
|
String url;
|
||||||
|
|
||||||
if (keepFragment || item.getLink().isEmpty()) {
|
if (keepFragment) {
|
||||||
url = item.getLink().orElse("");
|
url = item.url();
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
try {
|
try {
|
||||||
String link = item.getLink().get();
|
String link = item.url();
|
||||||
var linkUri = new URI(link);
|
var linkUri = new URI(link);
|
||||||
var cleanUri = new URI(linkUri.getScheme(), linkUri.getAuthority(), linkUri.getPath(), linkUri.getQuery(), null);
|
var cleanUri = new URI(linkUri.getScheme(), linkUri.getAuthority(), linkUri.getPath(), linkUri.getQuery(), null);
|
||||||
url = cleanUri.toString();
|
url = cleanUri.toString();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
// fallback to original link if we can't clean it, this is not a very important step
|
// fallback to original link if we can't clean it, this is not a very important step
|
||||||
url = item.getLink().get();
|
url = item.url();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return new FeedItem(title, date, description, url);
|
return new FeedItem(title, date, description, url);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String getItemDescription(Item item) {
|
private static String getItemDescription(SimpleFeedParser.ItemData item) {
|
||||||
Optional<String> description = item.getDescription();
|
String rawDescription = item.description();
|
||||||
if (description.isEmpty())
|
|
||||||
return "";
|
|
||||||
|
|
||||||
String rawDescription = description.get();
|
|
||||||
if (rawDescription.indexOf('<') >= 0) {
|
if (rawDescription.indexOf('<') >= 0) {
|
||||||
rawDescription = Jsoup.parseBodyFragment(rawDescription).text();
|
rawDescription = Jsoup.parseBodyFragment(rawDescription).text();
|
||||||
}
|
}
|
||||||
@@ -58,15 +54,18 @@ public record FeedItem(String title,
|
|||||||
|
|
||||||
// e.g. http://fabiensanglard.net/rss.xml does dates like this: 1 Apr 2021 00:00:00 +0000
|
// e.g. http://fabiensanglard.net/rss.xml does dates like this: 1 Apr 2021 00:00:00 +0000
|
||||||
private static final DateTimeFormatter extraFormatter = DateTimeFormatter.ofPattern("d MMM yyyy HH:mm:ss Z");
|
private static final DateTimeFormatter extraFormatter = DateTimeFormatter.ofPattern("d MMM yyyy HH:mm:ss Z");
|
||||||
private static String getItemDate(Item item) {
|
private static String getItemDate(SimpleFeedParser.ItemData item) {
|
||||||
Optional<ZonedDateTime> zonedDateTime = Optional.empty();
|
Optional<ZonedDateTime> zonedDateTime = Optional.empty();
|
||||||
try {
|
try {
|
||||||
zonedDateTime = item.getPubDateZonedDateTime();
|
zonedDateTime = item.getPubDateZonedDateTime();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
zonedDateTime = item.getPubDate()
|
try {
|
||||||
.map(extraFormatter::parse)
|
zonedDateTime = Optional.of(ZonedDateTime.from(extraFormatter.parse(item.pubDate())));
|
||||||
.map(ZonedDateTime::from);
|
}
|
||||||
|
catch (Exception e2) {
|
||||||
|
// ignore
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return zonedDateTime.map(date -> date.format(DATE_FORMAT)).orElse("");
|
return zonedDateTime.map(date -> date.format(DATE_FORMAT)).orElse("");
|
||||||
|
@@ -1,7 +1,5 @@
|
|||||||
package nu.marginalia.rss.svc;
|
package nu.marginalia.rss.svc;
|
||||||
|
|
||||||
import com.apptasticsoftware.rssreader.Item;
|
|
||||||
import com.apptasticsoftware.rssreader.RssReader;
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.opencsv.CSVReader;
|
import com.opencsv.CSVReader;
|
||||||
import nu.marginalia.WmsaHome;
|
import nu.marginalia.WmsaHome;
|
||||||
@@ -20,7 +18,6 @@ import nu.marginalia.storage.FileStorageService;
|
|||||||
import nu.marginalia.storage.model.FileStorage;
|
import nu.marginalia.storage.model.FileStorage;
|
||||||
import nu.marginalia.storage.model.FileStorageType;
|
import nu.marginalia.storage.model.FileStorageType;
|
||||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||||
import org.apache.commons.io.input.BOMInputStream;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@@ -32,7 +29,6 @@ import java.net.URISyntaxException;
|
|||||||
import java.net.http.HttpClient;
|
import java.net.http.HttpClient;
|
||||||
import java.net.http.HttpRequest;
|
import java.net.http.HttpRequest;
|
||||||
import java.net.http.HttpResponse;
|
import java.net.http.HttpResponse;
|
||||||
import java.nio.charset.StandardCharsets;
|
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
import java.time.*;
|
import java.time.*;
|
||||||
import java.time.format.DateTimeFormatter;
|
import java.time.format.DateTimeFormatter;
|
||||||
@@ -48,8 +44,6 @@ public class FeedFetcherService {
|
|||||||
private static final int MAX_FEED_ITEMS = 10;
|
private static final int MAX_FEED_ITEMS = 10;
|
||||||
private static final Logger logger = LoggerFactory.getLogger(FeedFetcherService.class);
|
private static final Logger logger = LoggerFactory.getLogger(FeedFetcherService.class);
|
||||||
|
|
||||||
private final RssReader rssReader = new RssReader();
|
|
||||||
|
|
||||||
private final FeedDb feedDb;
|
private final FeedDb feedDb;
|
||||||
private final FileStorageService fileStorageService;
|
private final FileStorageService fileStorageService;
|
||||||
private final NodeConfigurationService nodeConfigurationService;
|
private final NodeConfigurationService nodeConfigurationService;
|
||||||
@@ -72,17 +66,6 @@ public class FeedFetcherService {
|
|||||||
this.nodeConfigurationService = nodeConfigurationService;
|
this.nodeConfigurationService = nodeConfigurationService;
|
||||||
this.serviceHeartbeat = serviceHeartbeat;
|
this.serviceHeartbeat = serviceHeartbeat;
|
||||||
this.executorClient = executorClient;
|
this.executorClient = executorClient;
|
||||||
|
|
||||||
|
|
||||||
// Add support for some alternate date tags for atom
|
|
||||||
rssReader.addItemExtension("issued", this::setDateFallback);
|
|
||||||
rssReader.addItemExtension("created", this::setDateFallback);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void setDateFallback(Item item, String value) {
|
|
||||||
if (item.getPubDate().isEmpty()) {
|
|
||||||
item.setPubDate(value);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public enum UpdateMode {
|
public enum UpdateMode {
|
||||||
@@ -371,12 +354,7 @@ public class FeedFetcherService {
|
|||||||
|
|
||||||
public FeedItems parseFeed(String feedData, FeedDefinition definition) {
|
public FeedItems parseFeed(String feedData, FeedDefinition definition) {
|
||||||
try {
|
try {
|
||||||
feedData = sanitizeEntities(feedData);
|
List<SimpleFeedParser.ItemData> rawItems = SimpleFeedParser.parse(feedData);
|
||||||
|
|
||||||
List<Item> rawItems = rssReader.read(
|
|
||||||
// Massage the data to maximize the possibility of the flaky XML parser consuming it
|
|
||||||
new BOMInputStream(new ByteArrayInputStream(feedData.trim().getBytes(StandardCharsets.UTF_8)), false)
|
|
||||||
).toList();
|
|
||||||
|
|
||||||
boolean keepUriFragment = rawItems.size() < 2 || areFragmentsDisparate(rawItems);
|
boolean keepUriFragment = rawItems.size() < 2 || areFragmentsDisparate(rawItems);
|
||||||
|
|
||||||
@@ -399,33 +377,6 @@ public class FeedFetcherService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final Map<String, String> HTML_ENTITIES = Map.of(
|
|
||||||
"»", "»",
|
|
||||||
"«", "«",
|
|
||||||
"—", "--",
|
|
||||||
"–", "-",
|
|
||||||
"’", "'",
|
|
||||||
"‘", "'",
|
|
||||||
""", "\"",
|
|
||||||
" ", ""
|
|
||||||
);
|
|
||||||
|
|
||||||
/** The XML parser will blow up if you insert HTML entities in the feed XML,
|
|
||||||
* which is unfortunately relatively common. Replace them as far as is possible
|
|
||||||
* with their corresponding characters
|
|
||||||
*/
|
|
||||||
static String sanitizeEntities(String feedData) {
|
|
||||||
String result = feedData;
|
|
||||||
for (Map.Entry<String, String> entry : HTML_ENTITIES.entrySet()) {
|
|
||||||
result = result.replace(entry.getKey(), entry.getValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
// Handle lone ampersands not part of a recognized XML entity
|
|
||||||
result = result.replaceAll("&(?!(amp|lt|gt|apos|quot);)", "&");
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Decide whether to keep URI fragments in the feed items.
|
/** Decide whether to keep URI fragments in the feed items.
|
||||||
* <p></p>
|
* <p></p>
|
||||||
* We keep fragments if there are multiple different fragments in the items.
|
* We keep fragments if there are multiple different fragments in the items.
|
||||||
@@ -433,16 +384,16 @@ public class FeedFetcherService {
|
|||||||
* @param items The items to check
|
* @param items The items to check
|
||||||
* @return True if we should keep the fragments, false otherwise
|
* @return True if we should keep the fragments, false otherwise
|
||||||
*/
|
*/
|
||||||
private boolean areFragmentsDisparate(List<Item> items) {
|
private boolean areFragmentsDisparate(List<SimpleFeedParser.ItemData> items) {
|
||||||
Set<String> seenFragments = new HashSet<>();
|
Set<String> seenFragments = new HashSet<>();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
for (var item : items) {
|
for (var item : items) {
|
||||||
if (item.getLink().isEmpty()) {
|
if (item.url().isBlank()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
var link = item.getLink().get();
|
var link = item.url();
|
||||||
if (!link.contains("#")) {
|
if (!link.contains("#")) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@@ -10,6 +10,7 @@ import java.io.IOException;
|
|||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
import java.util.function.BiConsumer;
|
||||||
|
|
||||||
/** Utility for recording fetched feeds to a journal, useful in debugging feed parser issues.
|
/** Utility for recording fetched feeds to a journal, useful in debugging feed parser issues.
|
||||||
*/
|
*/
|
||||||
@@ -59,6 +60,17 @@ public interface FeedJournal extends AutoCloseable {
|
|||||||
urlWriter.put(url);
|
urlWriter.put(url);
|
||||||
contentsWriter.put(contents);
|
contentsWriter.put(contents);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void replay(Path journalPath, BiConsumer<String, String> urlAndContent) throws IOException {
|
||||||
|
try (SlopTable table = new SlopTable(journalPath)) {
|
||||||
|
final StringColumn.Reader urlReader = urlColumn.open(table);
|
||||||
|
final StringColumn.Reader contentsReader = contentsColumn.open(table);
|
||||||
|
|
||||||
|
while (urlReader.hasRemaining()) {
|
||||||
|
urlAndContent.accept(urlReader.get(), contentsReader.get());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -0,0 +1,94 @@
|
|||||||
|
package nu.marginalia.rss.svc;
|
||||||
|
|
||||||
|
import com.apptasticsoftware.rssreader.DateTimeParser;
|
||||||
|
import com.apptasticsoftware.rssreader.util.Default;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.parser.Parser;
|
||||||
|
|
||||||
|
import java.time.ZonedDateTime;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
public class SimpleFeedParser {
|
||||||
|
|
||||||
|
private static final DateTimeParser dateTimeParser = Default.getDateTimeParser();
|
||||||
|
|
||||||
|
public record ItemData (
|
||||||
|
String title,
|
||||||
|
String description,
|
||||||
|
String url,
|
||||||
|
String pubDate
|
||||||
|
) {
|
||||||
|
public boolean isWellFormed() {
|
||||||
|
return title != null && !title.isBlank() &&
|
||||||
|
description != null && !description.isBlank() &&
|
||||||
|
url != null && !url.isBlank() &&
|
||||||
|
pubDate != null && !pubDate.isBlank();
|
||||||
|
}
|
||||||
|
|
||||||
|
public Optional<ZonedDateTime> getPubDateZonedDateTime() {
|
||||||
|
try {
|
||||||
|
return Optional.ofNullable(dateTimeParser.parse(pubDate()));
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public static List<ItemData> parse(String content) {
|
||||||
|
var doc = Jsoup.parse(content, Parser.xmlParser());
|
||||||
|
List<ItemData> ret = new ArrayList<>();
|
||||||
|
|
||||||
|
doc.select("item, entry").forEach(element -> {
|
||||||
|
String link = "";
|
||||||
|
String title = "";
|
||||||
|
String description = "";
|
||||||
|
String pubDate = "";
|
||||||
|
|
||||||
|
for (String attr : List.of("title", "dc:title")) {
|
||||||
|
if (!title.isBlank())
|
||||||
|
break;
|
||||||
|
var tag = element.getElementsByTag(attr).first();
|
||||||
|
if (tag != null) {
|
||||||
|
title = tag.text();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (String attr : List.of("title", "summary", "content", "description", "dc:description")) {
|
||||||
|
if (!description.isBlank())
|
||||||
|
break;
|
||||||
|
var tag = element.getElementsByTag(attr).first();
|
||||||
|
if (tag != null) {
|
||||||
|
description = tag.text();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (String attr : List.of("pubDate", "published", "updated", "issued", "created", "dc:date")) {
|
||||||
|
if (!pubDate.isBlank())
|
||||||
|
break;
|
||||||
|
var tag = element.getElementsByTag(attr).first();
|
||||||
|
if (tag != null) {
|
||||||
|
pubDate = tag.text();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (String attr : List.of("link", "url")) {
|
||||||
|
if (!link.isBlank())
|
||||||
|
break;
|
||||||
|
var tag = element.getElementsByTag(attr).first();
|
||||||
|
if (tag != null) {
|
||||||
|
link = tag.text();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ret.add(new ItemData(title, description, link, pubDate));
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@@ -2,16 +2,21 @@ package nu.marginalia.livecapture;
|
|||||||
|
|
||||||
import org.junit.jupiter.api.Assertions;
|
import org.junit.jupiter.api.Assertions;
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
import org.junit.jupiter.api.Tag;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.testcontainers.containers.GenericContainer;
|
import org.testcontainers.containers.GenericContainer;
|
||||||
import org.testcontainers.junit.jupiter.Testcontainers;
|
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||||
import org.testcontainers.utility.DockerImageName;
|
import org.testcontainers.utility.DockerImageName;
|
||||||
|
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
@Testcontainers
|
@Testcontainers
|
||||||
|
@Tag("slow")
|
||||||
public class BrowserlessClientTest {
|
public class BrowserlessClientTest {
|
||||||
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome")).withExposedPorts(3000);
|
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
|
||||||
|
.withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
|
||||||
|
.withExposedPorts(3000);
|
||||||
|
|
||||||
@BeforeAll
|
@BeforeAll
|
||||||
public static void setup() {
|
public static void setup() {
|
||||||
|
@@ -1,50 +0,0 @@
|
|||||||
package nu.marginalia.rss.svc;
|
|
||||||
|
|
||||||
import com.apptasticsoftware.rssreader.Item;
|
|
||||||
import com.apptasticsoftware.rssreader.RssReader;
|
|
||||||
import org.junit.jupiter.api.Assertions;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Optional;
|
|
||||||
|
|
||||||
public class TestXmlSanitization {
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testPreservedEntities() {
|
|
||||||
Assertions.assertEquals("&", FeedFetcherService.sanitizeEntities("&"));
|
|
||||||
Assertions.assertEquals("<", FeedFetcherService.sanitizeEntities("<"));
|
|
||||||
Assertions.assertEquals(">", FeedFetcherService.sanitizeEntities(">"));
|
|
||||||
Assertions.assertEquals("'", FeedFetcherService.sanitizeEntities("'"));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testNlnetTitleTag() {
|
|
||||||
// The NLnet atom feed puts HTML tags in the entry/title tags, which breaks the vanilla RssReader code
|
|
||||||
|
|
||||||
// Verify we're able to consume and strip out the HTML tags
|
|
||||||
RssReader r = new RssReader();
|
|
||||||
|
|
||||||
List<Item> items = r.read(ClassLoader.getSystemResourceAsStream("nlnet.atom")).toList();
|
|
||||||
|
|
||||||
Assertions.assertEquals(1, items.size());
|
|
||||||
for (var item : items) {
|
|
||||||
Assertions.assertEquals(Optional.of("50 Free and Open Source Projects Selected for NGI Zero grants"), item.getTitle());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testStrayAmpersand() {
|
|
||||||
Assertions.assertEquals("Bed & Breakfast", FeedFetcherService.sanitizeEntities("Bed & Breakfast"));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testTranslatedHtmlEntity() {
|
|
||||||
Assertions.assertEquals("Foo -- Bar", FeedFetcherService.sanitizeEntities("Foo — Bar"));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testTranslatedHtmlEntityQuot() {
|
|
||||||
Assertions.assertEquals("\"Bob\"", FeedFetcherService.sanitizeEntities(""Bob""));
|
|
||||||
}
|
|
||||||
}
|
|
@@ -26,6 +26,8 @@ import org.slf4j.Logger;
|
|||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.time.Instant;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.concurrent.CompletableFuture;
|
import java.util.concurrent.CompletableFuture;
|
||||||
import java.util.concurrent.Future;
|
import java.util.concurrent.Future;
|
||||||
@@ -69,6 +71,8 @@ public class SearchSiteInfoService {
|
|||||||
this.searchSiteSubscriptions = searchSiteSubscriptions;
|
this.searchSiteSubscriptions = searchSiteSubscriptions;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private volatile SiteOverviewModel model = new SiteOverviewModel(List.of(), Instant.EPOCH);
|
||||||
|
|
||||||
@GET
|
@GET
|
||||||
@Path("/site")
|
@Path("/site")
|
||||||
public ModelAndView<?> handleOverview(@QueryParam String domain) {
|
public ModelAndView<?> handleOverview(@QueryParam String domain) {
|
||||||
@@ -77,6 +81,27 @@ public class SearchSiteInfoService {
|
|||||||
return new MapModelAndView("redirect.jte", Map.of("url", "/site/"+domain));
|
return new MapModelAndView("redirect.jte", Map.of("url", "/site/"+domain));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (model.age().compareTo(Duration.ofMinutes(15)) > 0) {
|
||||||
|
updateModel();
|
||||||
|
}
|
||||||
|
|
||||||
|
return new MapModelAndView("siteinfo/start.jte",
|
||||||
|
Map.of("navbar", NavbarModel.SITEINFO,
|
||||||
|
"model", model));
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Update the model if it is older than 15 minutes.
|
||||||
|
* This query is expensive and should not be run too often,
|
||||||
|
* and the data doesn't change that often either.
|
||||||
|
* <p></p>
|
||||||
|
* This method is synchronized to avoid multiple threads updating the model at the same time.
|
||||||
|
*/
|
||||||
|
private synchronized void updateModel() {
|
||||||
|
var currentModel = model;
|
||||||
|
if (currentModel.age().compareTo(Duration.ofMinutes(15)) < 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
List<SiteOverviewModel.DiscoveredDomain> domains = new ArrayList<>();
|
List<SiteOverviewModel.DiscoveredDomain> domains = new ArrayList<>();
|
||||||
|
|
||||||
try (var conn = dataSource.getConnection();
|
try (var conn = dataSource.getConnection();
|
||||||
@@ -91,13 +116,20 @@ public class SearchSiteInfoService {
|
|||||||
throw new RuntimeException();
|
throw new RuntimeException();
|
||||||
}
|
}
|
||||||
|
|
||||||
return new MapModelAndView("siteinfo/start.jte",
|
model = new SiteOverviewModel(domains);
|
||||||
Map.of("navbar", NavbarModel.SITEINFO,
|
}
|
||||||
"model", new SiteOverviewModel(domains)));
|
|
||||||
|
public record SiteOverviewModel(List<DiscoveredDomain> domains, Instant captureTime) {
|
||||||
|
|
||||||
|
public SiteOverviewModel(List<DiscoveredDomain> domains) {
|
||||||
|
this(domains, Instant.now());
|
||||||
}
|
}
|
||||||
|
|
||||||
public record SiteOverviewModel(List<DiscoveredDomain> domains) {
|
|
||||||
public record DiscoveredDomain(String name, String timestamp) {}
|
public record DiscoveredDomain(String name, String timestamp) {}
|
||||||
|
|
||||||
|
public Duration age() {
|
||||||
|
return Duration.between(captureTime, Instant.now());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@GET
|
@GET
|
||||||
|
@@ -81,35 +81,6 @@
|
|||||||
|
|
||||||
@endif
|
@endif
|
||||||
|
|
||||||
|
|
||||||
@if (!siteInfo.siblingDomains().isEmpty())
|
|
||||||
<div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
|
|
||||||
<i class="fas fa-globe"></i>
|
|
||||||
<span>Related Subdomains</span>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<table class="min-w-full divide-y divide-gray-200 dark:divide-gray-600 mx-4">
|
|
||||||
<thead>
|
|
||||||
<tr class="bg-gray-50 dark:bg-gray-700">
|
|
||||||
<th scope="col" class="px-2 py-2 text-left text-xs font-medium text-gray-500 dark:text-gray-100 uppercase tracking-wider">Domain Name</th>
|
|
||||||
</tr>
|
|
||||||
</thead>
|
|
||||||
<tbody class="bg-white dark:bg-gray-800 divide-y divide-gray-200 dark:divide-gray-600 text-xs">
|
|
||||||
@for (DbDomainQueries.DomainWithNode sibling : siteInfo.siblingDomains())
|
|
||||||
<tr>
|
|
||||||
<td class="px-3 py-6 md:py-3 whitespace-nowrap">
|
|
||||||
<a class="text-liteblue dark:text-blue-200" href="/site/${sibling.domain().toString()}">${sibling.domain().toString()}</a>
|
|
||||||
|
|
||||||
@if (!sibling.isIndexed())
|
|
||||||
<i class="ml-1 fa-regular fa-question-circle text-gray-400 dark:text-gray-600 text-xs" title="Not indexed"></i>
|
|
||||||
@endif
|
|
||||||
</td>
|
|
||||||
</tr>
|
|
||||||
@endfor
|
|
||||||
</tbody>
|
|
||||||
</table>
|
|
||||||
@endif
|
|
||||||
|
|
||||||
@if (siteInfo.domainInformation().isUnknownDomain())
|
@if (siteInfo.domainInformation().isUnknownDomain())
|
||||||
<div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
|
<div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
|
||||||
<i class="fa-regular fa-circle-question"></i>
|
<i class="fa-regular fa-circle-question"></i>
|
||||||
@@ -178,6 +149,36 @@
|
|||||||
</form>
|
</form>
|
||||||
@endif
|
@endif
|
||||||
|
|
||||||
|
|
||||||
|
@if (!siteInfo.siblingDomains().isEmpty())
|
||||||
|
<div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
|
||||||
|
<i class="fas fa-globe"></i>
|
||||||
|
<span>Related Subdomains</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<table class="min-w-full divide-y divide-gray-200 dark:divide-gray-600 mx-4">
|
||||||
|
<thead>
|
||||||
|
<tr class="bg-gray-50 dark:bg-gray-700">
|
||||||
|
<th scope="col" class="px-2 py-2 text-left text-xs font-medium text-gray-500 dark:text-gray-100 uppercase tracking-wider">Domain Name</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody class="bg-white dark:bg-gray-800 divide-y divide-gray-200 dark:divide-gray-600 text-xs">
|
||||||
|
@for (DbDomainQueries.DomainWithNode sibling : siteInfo.siblingDomains())
|
||||||
|
<tr>
|
||||||
|
<td class="px-3 py-6 md:py-3 whitespace-nowrap">
|
||||||
|
<a class="text-liteblue dark:text-blue-200" href="/site/${sibling.domain().toString()}">${sibling.domain().toString()}</a>
|
||||||
|
|
||||||
|
@if (!sibling.isIndexed())
|
||||||
|
<i class="ml-1 fa-regular fa-question-circle text-gray-400 dark:text-gray-600 text-xs" title="Not indexed"></i>
|
||||||
|
@endif
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
@endfor
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
@endif
|
||||||
|
|
||||||
|
|
||||||
@if (siteInfo.isKnown())
|
@if (siteInfo.isKnown())
|
||||||
<div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
|
<div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
|
||||||
<i class="fas fa-chart-simple"></i>
|
<i class="fas fa-chart-simple"></i>
|
||||||
|
Reference in New Issue
Block a user