mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00

Compare commits: deploy-004...deploy-005 (10 commits)
Commits:

- bc2c2061f2
- 1c7f5a31a5
- 59a8ea60f7
- aa9b1244ea
- 2d17233366
- b245cc9f38
- 6614d05bdf
- 55aeb03c4a
- faa589962f
- c7edd6b39f
@@ -20,7 +20,10 @@ public class DbDomainQueries {
     private final HikariDataSource dataSource;
 
+    private static final Logger logger = LoggerFactory.getLogger(DbDomainQueries.class);
+
     private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
+    private final Cache<Integer, EdgeDomain> domainNameCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
+    private final Cache<String, List<DomainWithNode>> siblingsCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
 
     @Inject
     public DbDomainQueries(HikariDataSource dataSource)
@@ -30,16 +33,21 @@ public class DbDomainQueries {
 
     public Integer getDomainId(EdgeDomain domain) throws NoSuchElementException {
-        try (var connection = dataSource.getConnection()) {
-
+        try {
             return domainIdCache.get(domain, () -> {
-                try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
+                try (var connection = dataSource.getConnection();
+                     var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
+
                     stmt.setString(1, domain.toString());
                     var rsp = stmt.executeQuery();
                     if (rsp.next()) {
                         return rsp.getInt(1);
                     }
                 }
+                catch (SQLException ex) {
+                    throw new RuntimeException(ex);
+                }
 
                 throw new NoSuchElementException();
             });
         }
@@ -49,9 +57,6 @@ public class DbDomainQueries {
         catch (ExecutionException ex) {
             throw new RuntimeException(ex.getCause());
         }
-        catch (SQLException ex) {
-            throw new RuntimeException(ex);
-        }
     }
 
     public OptionalInt tryGetDomainId(EdgeDomain domain) {
@@ -84,31 +89,38 @@ public class DbDomainQueries {
     }
 
     public Optional<EdgeDomain> getDomain(int id) {
-        try (var connection = dataSource.getConnection()) {
 
+        EdgeDomain existing = domainNameCache.getIfPresent(id);
+        if (existing != null) {
+            return Optional.of(existing);
+        }
+
+        try (var connection = dataSource.getConnection()) {
             try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
                 stmt.setInt(1, id);
                 var rsp = stmt.executeQuery();
                 if (rsp.next()) {
-                    return Optional.of(new EdgeDomain(rsp.getString(1)));
+                    var val = new EdgeDomain(rsp.getString(1));
+                    domainNameCache.put(id, val);
+                    return Optional.of(val);
                 }
                 return Optional.empty();
             }
         }
         catch (UncheckedExecutionException ex) {
             throw new RuntimeException(ex.getCause());
         }
         catch (SQLException ex) {
             throw new RuntimeException(ex);
         }
     }
 
-    public List<DomainWithNode> otherSubdomains(EdgeDomain domain, int cnt) {
+    public List<DomainWithNode> otherSubdomains(EdgeDomain domain, int cnt) throws ExecutionException {
+        String topDomain = domain.topDomain;
 
+        return siblingsCache.get(topDomain, () -> {
             List<DomainWithNode> ret = new ArrayList<>();
 
             try (var conn = dataSource.getConnection();
                  var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_TOP = ? LIMIT ?")) {
-                stmt.setString(1, domain.topDomain);
+                stmt.setString(1, topDomain);
                 stmt.setInt(2, cnt);
 
                 var rs = stmt.executeQuery();
@@ -123,8 +135,9 @@ public class DbDomainQueries {
             } catch (SQLException e) {
                 logger.error("Failed to get domain neighbors");
             }
 
             return ret;
+        });
     }
 
     public record DomainWithNode (EdgeDomain domain, int nodeAffinity) {
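
A note on the rewrite above: acquiring the connection inside the Guava cache loader means the pool is only touched on a cache miss, and loader failures surface as a checked ExecutionException. A minimal standalone sketch of that pattern, with a placeholder key type and lookup body rather than the real EC_DOMAIN query:

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;

import java.util.concurrent.ExecutionException;

public class CacheLoaderSketch {
    private final Cache<String, Integer> cache =
            CacheBuilder.newBuilder().maximumSize(10_000).build();

    public int lookup(String key) {
        try {
            // The Callable only runs on a cache miss; its result is stored under the key.
            return cache.get(key, () -> expensiveLookup(key));
        }
        catch (ExecutionException ex) {
            // Unwrap to rethrow whatever the loader threw.
            throw new RuntimeException(ex.getCause());
        }
    }

    private int expensiveLookup(String key) {
        return key.length(); // stand-in for the real database round-trip
    }
}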
@@ -1,118 +0,0 @@
-package nu.marginalia.db;
-
-import com.zaxxer.hikari.HikariDataSource;
-
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.OptionalInt;
-
-/** Class used in exporting data. This is intended to be used for a brief time
- * and then discarded, not kept around as a service.
- */
-public class DbDomainStatsExportMultitool implements AutoCloseable {
-    private final Connection connection;
-    private final int nodeId;
-    private final PreparedStatement knownUrlsQuery;
-    private final PreparedStatement visitedUrlsQuery;
-    private final PreparedStatement goodUrlsQuery;
-    private final PreparedStatement domainNameToId;
-
-    private final PreparedStatement allDomainsQuery;
-    private final PreparedStatement crawlQueueDomains;
-    private final PreparedStatement indexedDomainsQuery;
-
-    public DbDomainStatsExportMultitool(HikariDataSource dataSource, int nodeId) throws SQLException {
-        this.connection = dataSource.getConnection();
-        this.nodeId = nodeId;
-
-        knownUrlsQuery = connection.prepareStatement("""
-                SELECT KNOWN_URLS
-                FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
-                ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
-                WHERE DOMAIN_NAME=?
-                """);
-        visitedUrlsQuery = connection.prepareStatement("""
-                SELECT VISITED_URLS
-                FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
-                ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
-                WHERE DOMAIN_NAME=?
-                """);
-        goodUrlsQuery = connection.prepareStatement("""
-                SELECT GOOD_URLS
-                FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
-                ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
-                WHERE DOMAIN_NAME=?
-                """);
-        domainNameToId = connection.prepareStatement("""
-                SELECT ID
-                FROM EC_DOMAIN
-                WHERE DOMAIN_NAME=?
-                """);
-        allDomainsQuery = connection.prepareStatement("""
-                SELECT DOMAIN_NAME
-                FROM EC_DOMAIN
-                """);
-        crawlQueueDomains = connection.prepareStatement("""
-                SELECT DOMAIN_NAME
-                FROM CRAWL_QUEUE
-                """);
-        indexedDomainsQuery = connection.prepareStatement("""
-                SELECT DOMAIN_NAME
-                FROM EC_DOMAIN
-                WHERE INDEXED > 0
-                """);
-    }
-
-    public OptionalInt getVisitedUrls(String domainName) throws SQLException {
-        return executeNameToIntQuery(domainName, visitedUrlsQuery);
-    }
-
-    public OptionalInt getDomainId(String domainName) throws SQLException {
-        return executeNameToIntQuery(domainName, domainNameToId);
-    }
-
-    public List<String> getCrawlQueueDomains() throws SQLException {
-        return executeListQuery(crawlQueueDomains, 100);
-    }
-    public List<String> getAllIndexedDomains() throws SQLException {
-        return executeListQuery(indexedDomainsQuery, 100_000);
-    }
-
-    private OptionalInt executeNameToIntQuery(String domainName, PreparedStatement statement)
-            throws SQLException {
-        statement.setString(1, domainName);
-        var rs = statement.executeQuery();
-
-        if (rs.next()) {
-            return OptionalInt.of(rs.getInt(1));
-        }
-
-        return OptionalInt.empty();
-    }
-
-    private List<String> executeListQuery(PreparedStatement statement, int sizeHint) throws SQLException {
-        List<String> ret = new ArrayList<>(sizeHint);
-
-        var rs = statement.executeQuery();
-
-        while (rs.next()) {
-            ret.add(rs.getString(1));
-        }
-
-        return ret;
-    }
-
-    @Override
-    public void close() throws SQLException {
-        knownUrlsQuery.close();
-        goodUrlsQuery.close();
-        visitedUrlsQuery.close();
-        allDomainsQuery.close();
-        crawlQueueDomains.close();
-        domainNameToId.close();
-        connection.close();
-    }
-}
@@ -89,7 +89,7 @@ public class DatabaseModule extends AbstractModule {
         config.addDataSourceProperty("prepStmtCacheSize", "250");
         config.addDataSourceProperty("prepStmtCacheSqlLimit", "2048");
 
-        config.setMaximumPoolSize(5);
+        config.setMaximumPoolSize(Integer.getInteger("db.poolSize", 5));
         config.setMinimumIdle(2);
 
         config.setMaxLifetime(Duration.ofMinutes(9).toMillis());
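
The pool-size change above swaps a hard-coded value for a JVM system property. Integer.getInteger reads -D flags, not environment variables, so the override is passed on the command line; a small illustration:

// java -Ddb.poolSize=10 ...  ->  poolSize == 10; without the flag, the default 5 applies.
int poolSize = Integer.getInteger("db.poolSize", 5);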
@@ -15,7 +15,9 @@ import java.util.Map;
 
 /** Client for local browserless.io API */
 public class BrowserlessClient implements AutoCloseable {
+
     private static final Logger logger = LoggerFactory.getLogger(BrowserlessClient.class);
+    private static final String BROWSERLESS_TOKEN = System.getProperty("live-capture.browserless-token", "BROWSERLESS_TOKEN");
 
     private final HttpClient httpClient = HttpClient.newBuilder()
             .version(HttpClient.Version.HTTP_1_1)
@@ -36,7 +38,7 @@ public class BrowserlessClient implements AutoCloseable {
         );
 
         var request = HttpRequest.newBuilder()
-                .uri(browserlessURI.resolve("/content"))
+                .uri(browserlessURI.resolve("/content?token="+BROWSERLESS_TOKEN))
                 .method("POST", HttpRequest.BodyPublishers.ofString(
                         gson.toJson(requestData)
                 ))
@@ -63,7 +65,7 @@ public class BrowserlessClient implements AutoCloseable {
         );
 
         var request = HttpRequest.newBuilder()
-                .uri(browserlessURI.resolve("/screenshot"))
+                .uri(browserlessURI.resolve("/screenshot?token="+BROWSERLESS_TOKEN))
                 .method("POST", HttpRequest.BodyPublishers.ofString(
                         gson.toJson(requestData)
                 ))
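
The token changes above thread a shared secret through as a query parameter. A hedged sketch of the round trip, assuming a local browserless container started with a matching TOKEN environment variable; the host, port, and token value are placeholders:

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class TokenCallSketch {
    public static void main(String[] args) throws Exception {
        String token = System.getProperty("live-capture.browserless-token", "BROWSERLESS_TOKEN");

        // Browserless rejects requests whose ?token= does not match its TOKEN env var.
        var request = HttpRequest.newBuilder()
                .uri(URI.create("http://localhost:3000").resolve("/content?token=" + token))
                .method("POST", HttpRequest.BodyPublishers.ofString("{\"url\": \"https://example.com/\"}"))
                .header("Content-type", "application/json")
                .build();

        var rsp = HttpClient.newHttpClient().send(request, HttpResponse.BodyHandlers.ofString());
        System.out.println(rsp.statusCode());
    }
}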
@@ -1,6 +1,6 @@
 package nu.marginalia.rss.model;
 
-import com.apptasticsoftware.rssreader.Item;
+import nu.marginalia.rss.svc.SimpleFeedParser;
 import org.apache.commons.lang3.StringUtils;
 import org.jetbrains.annotations.NotNull;
 import org.jsoup.Jsoup;
@@ -18,37 +18,33 @@ public record FeedItem(String title,
     public static final int MAX_DESC_LENGTH = 255;
     public static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
 
-    public static FeedItem fromItem(Item item, boolean keepFragment) {
-        String title = item.getTitle().orElse("");
+    public static FeedItem fromItem(SimpleFeedParser.ItemData item, boolean keepFragment) {
+        String title = item.title();
         String date = getItemDate(item);
         String description = getItemDescription(item);
         String url;
 
-        if (keepFragment || item.getLink().isEmpty()) {
-            url = item.getLink().orElse("");
+        if (keepFragment) {
+            url = item.url();
         }
         else {
             try {
-                String link = item.getLink().get();
+                String link = item.url();
                 var linkUri = new URI(link);
                 var cleanUri = new URI(linkUri.getScheme(), linkUri.getAuthority(), linkUri.getPath(), linkUri.getQuery(), null);
                 url = cleanUri.toString();
             }
             catch (Exception e) {
                 // fallback to original link if we can't clean it, this is not a very important step
-                url = item.getLink().get();
+                url = item.url();
             }
         }
 
         return new FeedItem(title, date, description, url);
     }
 
-    private static String getItemDescription(Item item) {
-        Optional<String> description = item.getDescription();
-        if (description.isEmpty())
-            return "";
-
-        String rawDescription = description.get();
+    private static String getItemDescription(SimpleFeedParser.ItemData item) {
+        String rawDescription = item.description();
         if (rawDescription.indexOf('<') >= 0) {
             rawDescription = Jsoup.parseBodyFragment(rawDescription).text();
         }
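
The URL-cleaning branch above survives the migration unchanged: rebuilding the URI with a null fragment component drops everything after '#'. A self-contained sketch:

import java.net.URI;

public class FragmentStripSketch {
    public static void main(String[] args) throws Exception {
        var linkUri = new URI("https://example.com/post?id=1#comments");
        // Passing null as the fragment removes it; scheme, host, path, and query survive.
        var cleanUri = new URI(linkUri.getScheme(), linkUri.getAuthority(),
                linkUri.getPath(), linkUri.getQuery(), null);
        System.out.println(cleanUri); // https://example.com/post?id=1
    }
}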
@@ -58,15 +54,18 @@ public record FeedItem(String title,
 
     // e.g. http://fabiensanglard.net/rss.xml does dates like this: 1 Apr 2021 00:00:00 +0000
     private static final DateTimeFormatter extraFormatter = DateTimeFormatter.ofPattern("d MMM yyyy HH:mm:ss Z");
-    private static String getItemDate(Item item) {
+    private static String getItemDate(SimpleFeedParser.ItemData item) {
         Optional<ZonedDateTime> zonedDateTime = Optional.empty();
         try {
             zonedDateTime = item.getPubDateZonedDateTime();
         }
         catch (Exception e) {
-            zonedDateTime = item.getPubDate()
-                    .map(extraFormatter::parse)
-                    .map(ZonedDateTime::from);
+            try {
+                zonedDateTime = Optional.of(ZonedDateTime.from(extraFormatter.parse(item.pubDate())));
+            }
+            catch (Exception e2) {
+                // ignore
+            }
         }
 
         return zonedDateTime.map(date -> date.format(DATE_FORMAT)).orElse("");
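
The fallback formatter above handles feeds whose dates are not RFC-1123. A standalone sketch of that parse path; Locale.ENGLISH is an assumption added here so the month abbreviation parses regardless of the JVM's default locale:

import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Locale;

public class DateFallbackSketch {
    private static final DateTimeFormatter extraFormatter =
            DateTimeFormatter.ofPattern("d MMM yyyy HH:mm:ss Z", Locale.ENGLISH);

    public static void main(String[] args) {
        // The style of date fabiensanglard.net emits, per the comment in FeedItem.
        ZonedDateTime date = ZonedDateTime.from(extraFormatter.parse("1 Apr 2021 00:00:00 +0000"));
        System.out.println(date); // 2021-04-01T00:00Z
    }
}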
@@ -1,7 +1,5 @@
 package nu.marginalia.rss.svc;
 
-import com.apptasticsoftware.rssreader.Item;
-import com.apptasticsoftware.rssreader.RssReader;
 import com.google.inject.Inject;
 import com.opencsv.CSVReader;
 import nu.marginalia.WmsaHome;
@@ -20,7 +18,6 @@ import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorage;
 import nu.marginalia.storage.model.FileStorageType;
 import nu.marginalia.util.SimpleBlockingThreadPool;
-import org.apache.commons.io.input.BOMInputStream;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -32,7 +29,6 @@ import java.net.URISyntaxException;
 import java.net.http.HttpClient;
 import java.net.http.HttpRequest;
 import java.net.http.HttpResponse;
-import java.nio.charset.StandardCharsets;
 import java.sql.SQLException;
 import java.time.*;
 import java.time.format.DateTimeFormatter;
@@ -48,8 +44,6 @@ public class FeedFetcherService {
     private static final int MAX_FEED_ITEMS = 10;
     private static final Logger logger = LoggerFactory.getLogger(FeedFetcherService.class);
 
-    private final RssReader rssReader = new RssReader();
-
     private final FeedDb feedDb;
     private final FileStorageService fileStorageService;
     private final NodeConfigurationService nodeConfigurationService;
@@ -72,17 +66,6 @@ public class FeedFetcherService {
         this.nodeConfigurationService = nodeConfigurationService;
         this.serviceHeartbeat = serviceHeartbeat;
         this.executorClient = executorClient;
-
-
-        // Add support for some alternate date tags for atom
-        rssReader.addItemExtension("issued", this::setDateFallback);
-        rssReader.addItemExtension("created", this::setDateFallback);
-    }
-
-    private void setDateFallback(Item item, String value) {
-        if (item.getPubDate().isEmpty()) {
-            item.setPubDate(value);
-        }
     }
 
     public enum UpdateMode {
@@ -371,12 +354,7 @@ public class FeedFetcherService {
 
     public FeedItems parseFeed(String feedData, FeedDefinition definition) {
         try {
-            feedData = sanitizeEntities(feedData);
-
-            List<Item> rawItems = rssReader.read(
-                    // Massage the data to maximize the possibility of the flaky XML parser consuming it
-                    new BOMInputStream(new ByteArrayInputStream(feedData.trim().getBytes(StandardCharsets.UTF_8)), false)
-            ).toList();
+            List<SimpleFeedParser.ItemData> rawItems = SimpleFeedParser.parse(feedData);
 
             boolean keepUriFragment = rawItems.size() < 2 || areFragmentsDisparate(rawItems);
 
@@ -399,33 +377,6 @@ public class FeedFetcherService {
         }
     }
 
-    private static final Map<String, String> HTML_ENTITIES = Map.of(
-            "&raquo;", "»",
-            "&laquo;", "«",
-            "&mdash;", "--",
-            "&ndash;", "-",
-            "&rsquo;", "'",
-            "&lsquo;", "'",
-            "&quot;", "\"",
-            "&nbsp;", ""
-    );
-
-    /** The XML parser will blow up if you insert HTML entities in the feed XML,
-     * which is unfortunately relatively common. Replace them as far as is possible
-     * with their corresponding characters
-     */
-    static String sanitizeEntities(String feedData) {
-        String result = feedData;
-        for (Map.Entry<String, String> entry : HTML_ENTITIES.entrySet()) {
-            result = result.replace(entry.getKey(), entry.getValue());
-        }
-
-        // Handle lone ampersands not part of a recognized XML entity
-        result = result.replaceAll("&(?!(amp|lt|gt|apos|quot);)", "&amp;");
-
-        return result;
-    }
-
     /** Decide whether to keep URI fragments in the feed items.
      * <p></p>
     * We keep fragments if there are multiple different fragments in the items.
@@ -433,16 +384,16 @@ public class FeedFetcherService {
     * @param items The items to check
     * @return True if we should keep the fragments, false otherwise
     */
-    private boolean areFragmentsDisparate(List<Item> items) {
+    private boolean areFragmentsDisparate(List<SimpleFeedParser.ItemData> items) {
         Set<String> seenFragments = new HashSet<>();
 
         try {
             for (var item : items) {
-                if (item.getLink().isEmpty()) {
+                if (item.url().isBlank()) {
                     continue;
                 }
 
-                var link = item.getLink().get();
+                var link = item.url();
                 if (!link.contains("#")) {
                     continue;
                 }
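
For reference, the sanitizer deleted above (presumably unnecessary now that parsing goes through Jsoup's lenient XML parser) behaves like this self-contained sketch: named entities become plain characters, then any ampersand not starting a recognized XML entity is escaped:

import java.util.Map;

public class SanitizeSketch {
    private static final Map<String, String> HTML_ENTITIES = Map.of(
            "&mdash;", "--",
            "&nbsp;", ""
    );

    static String sanitize(String feedData) {
        String result = feedData;
        for (var entry : HTML_ENTITIES.entrySet()) {
            result = result.replace(entry.getKey(), entry.getValue());
        }
        // Negative lookahead: escape lone '&' but leave real XML entities alone.
        return result.replaceAll("&(?!(amp|lt|gt|apos|quot);)", "&amp;");
    }

    public static void main(String[] args) {
        System.out.println(sanitize("Bed & Breakfast &mdash; &lt;cheap&gt;"));
        // Bed &amp; Breakfast -- &lt;cheap&gt;
    }
}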
@@ -10,6 +10,7 @@ import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.util.function.BiConsumer;
 
 /** Utility for recording fetched feeds to a journal, useful in debugging feed parser issues.
  */
@@ -59,6 +60,17 @@ public interface FeedJournal extends AutoCloseable {
             urlWriter.put(url);
             contentsWriter.put(contents);
         }
     }
+
+    static void replay(Path journalPath, BiConsumer<String, String> urlAndContent) throws IOException {
+        try (SlopTable table = new SlopTable(journalPath)) {
+            final StringColumn.Reader urlReader = urlColumn.open(table);
+            final StringColumn.Reader contentsReader = contentsColumn.open(table);
+
+            while (urlReader.hasRemaining()) {
+                urlAndContent.accept(urlReader.get(), contentsReader.get());
+            }
+        }
+    }
 }
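
A usage sketch for the new replay method, based only on the signature shown above; the journal path is a placeholder:

// Dump every journaled feed to stdout for offline parser debugging.
FeedJournal.replay(Path.of("/tmp/feed-journal"), (url, contents) ->
        System.out.println(url + " (" + contents.length() + " chars)"));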
@@ -0,0 +1,94 @@
+package nu.marginalia.rss.svc;
+
+import com.apptasticsoftware.rssreader.DateTimeParser;
+import com.apptasticsoftware.rssreader.util.Default;
+import org.jsoup.Jsoup;
+import org.jsoup.parser.Parser;
+
+import java.time.ZonedDateTime;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Optional;
+
+public class SimpleFeedParser {
+
+    private static final DateTimeParser dateTimeParser = Default.getDateTimeParser();
+
+    public record ItemData (
+            String title,
+            String description,
+            String url,
+            String pubDate
+    ) {
+        public boolean isWellFormed() {
+            return title != null && !title.isBlank() &&
+                    description != null && !description.isBlank() &&
+                    url != null && !url.isBlank() &&
+                    pubDate != null && !pubDate.isBlank();
+        }
+
+        public Optional<ZonedDateTime> getPubDateZonedDateTime() {
+            try {
+                return Optional.ofNullable(dateTimeParser.parse(pubDate()));
+            }
+            catch (Exception e) {
+                return Optional.empty();
+            }
+        }
+    }
+
+    public static List<ItemData> parse(String content) {
+        var doc = Jsoup.parse(content, Parser.xmlParser());
+        List<ItemData> ret = new ArrayList<>();
+
+        doc.select("item, entry").forEach(element -> {
+            String link = "";
+            String title = "";
+            String description = "";
+            String pubDate = "";
+
+            for (String attr : List.of("title", "dc:title")) {
+                if (!title.isBlank())
+                    break;
+                var tag = element.getElementsByTag(attr).first();
+                if (tag != null) {
+                    title = tag.text();
+                }
+            }
+
+            for (String attr : List.of("title", "summary", "content", "description", "dc:description")) {
+                if (!description.isBlank())
+                    break;
+                var tag = element.getElementsByTag(attr).first();
+                if (tag != null) {
+                    description = tag.text();
+                }
+            }
+
+            for (String attr : List.of("pubDate", "published", "updated", "issued", "created", "dc:date")) {
+                if (!pubDate.isBlank())
+                    break;
+                var tag = element.getElementsByTag(attr).first();
+                if (tag != null) {
+                    pubDate = tag.text();
+                }
+            }
+
+            for (String attr : List.of("link", "url")) {
+                if (!link.isBlank())
+                    break;
+                var tag = element.getElementsByTag(attr).first();
+                if (tag != null) {
+                    link = tag.text();
+                }
+            }
+
+            ret.add(new ItemData(title, description, link, pubDate));
+        });
+
+        return ret;
+    }
+}
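
A quick usage sketch of the new parser against a minimal RSS fragment:

var items = SimpleFeedParser.parse("""
        <rss><channel><item>
            <title>Hello</title>
            <link>https://example.com/post</link>
            <description>World</description>
            <pubDate>Thu, 1 Apr 2021 00:00:00 +0000</pubDate>
        </item></channel></rss>
        """);
System.out.println(items.get(0).title());        // Hello
System.out.println(items.get(0).isWellFormed()); // true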
@@ -2,16 +2,21 @@ package nu.marginalia.livecapture;
 
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Tag;
 import org.junit.jupiter.api.Test;
 import org.testcontainers.containers.GenericContainer;
 import org.testcontainers.junit.jupiter.Testcontainers;
 import org.testcontainers.utility.DockerImageName;
 
 import java.net.URI;
+import java.util.Map;
 
 @Testcontainers
+@Tag("slow")
 public class BrowserlessClientTest {
-    static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome")).withExposedPorts(3000);
+    static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
+            .withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
+            .withExposedPorts(3000);
 
     @BeforeAll
     public static void setup() {
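
Since the test container now sets a TOKEN, the client under test must present the same value. A sketch of how a test would derive the base URI (getHost and getMappedPort are standard Testcontainers accessors):

// The container's port 3000 is mapped to a random host port by Testcontainers.
URI browserlessURI = URI.create(
        "http://" + container.getHost() + ":" + container.getMappedPort(3000) + "/");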
@@ -1,50 +0,0 @@
-package nu.marginalia.rss.svc;
-
-import com.apptasticsoftware.rssreader.Item;
-import com.apptasticsoftware.rssreader.RssReader;
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.Test;
-
-import java.util.List;
-import java.util.Optional;
-
-public class TestXmlSanitization {
-
-    @Test
-    public void testPreservedEntities() {
-        Assertions.assertEquals("&amp;", FeedFetcherService.sanitizeEntities("&amp;"));
-        Assertions.assertEquals("&lt;", FeedFetcherService.sanitizeEntities("&lt;"));
-        Assertions.assertEquals("&gt;", FeedFetcherService.sanitizeEntities("&gt;"));
-        Assertions.assertEquals("&apos;", FeedFetcherService.sanitizeEntities("&apos;"));
-    }
-
-    @Test
-    public void testNlnetTitleTag() {
-        // The NLnet atom feed puts HTML tags in the entry/title tags, which breaks the vanilla RssReader code
-
-        // Verify we're able to consume and strip out the HTML tags
-        RssReader r = new RssReader();
-
-        List<Item> items = r.read(ClassLoader.getSystemResourceAsStream("nlnet.atom")).toList();
-
-        Assertions.assertEquals(1, items.size());
-        for (var item : items) {
-            Assertions.assertEquals(Optional.of("50 Free and Open Source Projects Selected for NGI Zero grants"), item.getTitle());
-        }
-    }
-
-    @Test
-    public void testStrayAmpersand() {
-        Assertions.assertEquals("Bed &amp; Breakfast", FeedFetcherService.sanitizeEntities("Bed & Breakfast"));
-    }
-
-    @Test
-    public void testTranslatedHtmlEntity() {
-        Assertions.assertEquals("Foo -- Bar", FeedFetcherService.sanitizeEntities("Foo &mdash; Bar"));
-    }
-
-    @Test
-    public void testTranslatedHtmlEntityQuot() {
-        Assertions.assertEquals("\"Bob\"", FeedFetcherService.sanitizeEntities("&quot;Bob&quot;"));
-    }
-}
@@ -16,20 +16,18 @@ import org.slf4j.LoggerFactory;
 
 import java.util.ArrayList;
 import java.util.Comparator;
-import java.util.Iterator;
 import java.util.List;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
-
-import static java.lang.Math.clamp;
+import java.util.concurrent.atomic.AtomicInteger;
 
 @Singleton
 public class IndexClient {
     private static final Logger logger = LoggerFactory.getLogger(IndexClient.class);
     private final GrpcMultiNodeChannelPool<IndexApiGrpc.IndexApiBlockingStub> channelPool;
     private final DomainBlacklistImpl blacklist;
-    private static final ExecutorService executor = Executors.newVirtualThreadPerTaskExecutor();
+    private static final ExecutorService executor = Executors.newCachedThreadPool();
 
     @Inject
     public IndexClient(GrpcChannelPoolFactory channelPoolFactory, DomainBlacklistImpl blacklist) {
@@ -51,40 +49,31 @@ public class IndexClient {
 
     /** Execute a query on the index partitions and return the combined results. */
     public AggregateQueryResponse executeQueries(RpcIndexQuery indexRequest, Pagination pagination) {
-        List<CompletableFuture<Iterator<RpcDecoratedResultItem>>> futures =
-                channelPool.call(IndexApiGrpc.IndexApiBlockingStub::query)
-                        .async(executor)
-                        .runEach(indexRequest);
-
         final int requestedMaxResults = indexRequest.getQueryLimits().getResultsTotal();
-        final int resultsUpperBound = requestedMaxResults * channelPool.getNumNodes();
-
-        List<RpcDecoratedResultItem> results = new ArrayList<>(resultsUpperBound);
-
-        for (var future : futures) {
-            try {
-                future.get().forEachRemaining(results::add);
-            }
-            catch (Exception e) {
-                logger.error("Downstream exception", e);
-            }
-        }
+        AtomicInteger totalNumResults = new AtomicInteger(0);
 
-        // Sort the results by ranking score and remove blacklisted domains
-        results.sort(comparator);
-        results.removeIf(this::isBlacklisted);
-
-        int numReceivedResults = results.size();
-
-        // pagination is typically 1-indexed, so we need to adjust the start and end indices
-        int indexStart = (pagination.page - 1) * pagination.pageSize;
-        int indexEnd = (pagination.page) * pagination.pageSize;
-
-        results = results.subList(
-                clamp(indexStart, 0, Math.max(0, results.size() - 1)), // from is inclusive, so subtract 1 from size()
-                clamp(indexEnd, 0, results.size()));
+        List<RpcDecoratedResultItem> results =
+                channelPool.call(IndexApiGrpc.IndexApiBlockingStub::query)
+                        .async(executor)
+                        .runEach(indexRequest)
+                        .stream()
+                        .map(future -> future.thenApply(iterator -> {
+                            List<RpcDecoratedResultItem> ret = new ArrayList<>(requestedMaxResults);
+                            iterator.forEachRemaining(ret::add);
+                            totalNumResults.addAndGet(ret.size());
+                            return ret;
+                        }))
+                        .map(CompletableFuture::join)
+                        .flatMap(List::stream)
+                        .filter(item -> !isBlacklisted(item))
+                        .sorted(comparator)
+                        .skip(Math.max(0, (pagination.page - 1) * pagination.pageSize))
+                        .limit(pagination.pageSize)
+                        .toList();
 
-        return new AggregateQueryResponse(results, pagination.page(), numReceivedResults);
+        return new AggregateQueryResponse(results, pagination.page(), totalNumResults.get());
     }
 
     private boolean isBlacklisted(RpcDecoratedResultItem item) {
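
The executeQueries rewrite above replaces the collect-then-slice logic with a single stream pipeline, counting results before pagination so the total is still reported. A self-contained sketch of the same shape, with integers standing in for result items:

import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.atomic.AtomicInteger;

public class FanOutSketch {
    public static void main(String[] args) {
        // One future per index node, each yielding a partial result list.
        List<CompletableFuture<List<Integer>>> perNode = List.of(
                CompletableFuture.supplyAsync(() -> List.of(3, 1)),
                CompletableFuture.supplyAsync(() -> List.of(2, 5)));

        AtomicInteger total = new AtomicInteger();
        int page = 1, pageSize = 3;

        List<Integer> results = perNode.stream()
                .map(f -> f.thenApply(list -> { total.addAndGet(list.size()); return list; }))
                .map(CompletableFuture::join)   // blocks until each node responds
                .flatMap(List::stream)
                .sorted()
                .skip((long) (page - 1) * pageSize)
                .limit(pageSize)
                .toList();

        System.out.println(results + " of " + total.get()); // [1, 2, 3] of 4
    }
}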
@@ -28,6 +28,7 @@ import org.slf4j.LoggerFactory;
 import java.sql.SQLException;
 import java.util.*;
 import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 import java.util.function.Supplier;
@@ -67,8 +68,12 @@ public class SearchSiteInfoService {
         this.screenshotService = screenshotService;
         this.dataSource = dataSource;
         this.searchSiteSubscriptions = searchSiteSubscriptions;
+
+        Thread.ofPlatform().name("Recently Added Domains Model Updater").start(this::modelUpdater);
     }
 
+    private volatile SiteOverviewModel cachedOverviewModel = new SiteOverviewModel(List.of());
+
     @GET
     @Path("/site")
     public ModelAndView<?> handleOverview(@QueryParam String domain) {
@@ -77,23 +82,48 @@ public class SearchSiteInfoService {
             return new MapModelAndView("redirect.jte", Map.of("url", "/site/"+domain));
         }
 
-        List<SiteOverviewModel.DiscoveredDomain> domains = new ArrayList<>();
-
-        try (var conn = dataSource.getConnection();
-             var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, DISCOVER_DATE FROM EC_DOMAIN WHERE NODE_AFFINITY = 0 ORDER BY ID DESC LIMIT 10")) {
-
-            var rs = stmt.executeQuery();
-            while (rs.next()) {
-                domains.add(new SiteOverviewModel.DiscoveredDomain(rs.getString("DOMAIN_NAME"), rs.getString("DISCOVER_DATE")));
-            }
-        }
-        catch (SQLException ex) {
-            throw new RuntimeException();
-        }
-
         return new MapModelAndView("siteinfo/start.jte",
                 Map.of("navbar", NavbarModel.SITEINFO,
-                        "model", new SiteOverviewModel(domains)));
+                        "model", cachedOverviewModel));
     }
 
+    private void modelUpdater() {
+        while (!Thread.interrupted()) {
+            List<SiteOverviewModel.DiscoveredDomain> domains = new ArrayList<>();
+
+            // This query can be quite expensive, so we can't run it on demand
+            // for every request. Instead, we run it every 15 minutes and cache
+            // the result.
+
+            try (var conn = dataSource.getConnection();
+                 var stmt = conn.prepareStatement("""
+                         SELECT DOMAIN_NAME, DISCOVER_DATE
+                         FROM EC_DOMAIN
+                         WHERE NODE_AFFINITY = 0
+                         ORDER BY ID DESC
+                         LIMIT 10
+                         """))
+            {
+                var rs = stmt.executeQuery();
+                while (rs.next()) {
+                    domains.add(new SiteOverviewModel.DiscoveredDomain(
+                            rs.getString("DOMAIN_NAME"),
+                            rs.getString("DISCOVER_DATE"))
+                    );
+                }
+            } catch (SQLException ex) {
+                logger.warn("Failed to get recently added domains: {}", ex.getMessage());
+            }
+
+            cachedOverviewModel = new SiteOverviewModel(domains);
+
+            try {
+                TimeUnit.MINUTES.sleep(15);
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+                break;
+            }
+        }
+    }
+
     public record SiteOverviewModel(List<DiscoveredDomain> domains) {
@@ -107,7 +137,7 @@ public class SearchSiteInfoService {
             @PathParam String domainName,
             @QueryParam String view,
             @QueryParam Integer page
-    ) throws SQLException {
+    ) throws SQLException, ExecutionException {
 
         if (null == domainName || domainName.isBlank()) {
             return null;
@@ -193,7 +223,7 @@ public class SearchSiteInfoService {
         );
     }
 
-    private SiteInfoWithContext listInfo(Context context, String domainName) {
+    private SiteInfoWithContext listInfo(Context context, String domainName) throws ExecutionException {
 
         var domain = new EdgeDomain(domainName);
         final int domainId = domainQueries.tryGetDomainId(domain).orElse(-1);
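
The modelUpdater addition above is a read-mostly cache: request threads read a volatile reference while one background platform thread refreshes it every 15 minutes. A stripped-down sketch of the same pattern, with the SQL swapped for a stub:

import java.util.List;
import java.util.concurrent.TimeUnit;

public class CachedModelSketch {
    // Volatile so request threads always see the latest published snapshot.
    private volatile List<String> cachedModel = List.of();

    public CachedModelSketch() {
        Thread.ofPlatform().name("model-updater").start(this::updateLoop);
    }

    public List<String> currentModel() {
        return cachedModel; // never blocks on the expensive query
    }

    private void updateLoop() {
        while (!Thread.interrupted()) {
            cachedModel = runExpensiveQuery();
            try {
                TimeUnit.MINUTES.sleep(15);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                break;
            }
        }
    }

    private List<String> runExpensiveQuery() {
        return List.of("example.com"); // stand-in for the real EC_DOMAIN query
    }
}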
@@ -1,5 +1,4 @@
 @import nu.marginalia.db.DbDomainQueries
-@import nu.marginalia.model.EdgeDomain
 @import nu.marginalia.search.svc.SearchSiteInfoService
 @import nu.marginalia.search.svc.SearchSiteInfoService.*
 @import nu.marginalia.search.model.UrlDetails
@@ -81,35 +80,6 @@
 
     @endif
 
-
-    @if (!siteInfo.siblingDomains().isEmpty())
-        <div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
-            <i class="fas fa-globe"></i>
-            <span>Related Subdomains</span>
-        </div>
-
-        <table class="min-w-full divide-y divide-gray-200 dark:divide-gray-600 mx-4">
-            <thead>
-            <tr class="bg-gray-50 dark:bg-gray-700">
-                <th scope="col" class="px-2 py-2 text-left text-xs font-medium text-gray-500 dark:text-gray-100 uppercase tracking-wider">Domain Name</th>
-            </tr>
-            </thead>
-            <tbody class="bg-white dark:bg-gray-800 divide-y divide-gray-200 dark:divide-gray-600 text-xs">
-            @for (DbDomainQueries.DomainWithNode sibling : siteInfo.siblingDomains())
-                <tr>
-                    <td class="px-3 py-6 md:py-3 whitespace-nowrap">
-                        <a class="text-liteblue dark:text-blue-200" href="/site/${sibling.domain().toString()}">${sibling.domain().toString()}</a>
-
-                        @if (!sibling.isIndexed())
-                            <i class="ml-1 fa-regular fa-question-circle text-gray-400 dark:text-gray-600 text-xs" title="Not indexed"></i>
-                        @endif
-                    </td>
-                </tr>
-            @endfor
-            </tbody>
-        </table>
-    @endif
-
     @if (siteInfo.domainInformation().isUnknownDomain())
         <div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
             <i class="fa-regular fa-circle-question"></i>
@@ -178,6 +148,36 @@
         </form>
     @endif
 
+
+    @if (!siteInfo.siblingDomains().isEmpty())
+        <div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
+            <i class="fas fa-globe"></i>
+            <span>Related Subdomains</span>
+        </div>
+
+        <table class="min-w-full divide-y divide-gray-200 dark:divide-gray-600 mx-4">
+            <thead>
+            <tr class="bg-gray-50 dark:bg-gray-700">
+                <th scope="col" class="px-2 py-2 text-left text-xs font-medium text-gray-500 dark:text-gray-100 uppercase tracking-wider">Domain Name</th>
+            </tr>
+            </thead>
+            <tbody class="bg-white dark:bg-gray-800 divide-y divide-gray-200 dark:divide-gray-600 text-xs">
+            @for (DbDomainQueries.DomainWithNode sibling : siteInfo.siblingDomains())
+                <tr>
+                    <td class="px-3 py-6 md:py-3 whitespace-nowrap">
+                        <a class="text-liteblue dark:text-blue-200" href="/site/${sibling.domain().toString()}">${sibling.domain().toString()}</a>
+
+                        @if (!sibling.isIndexed())
+                            <i class="ml-1 fa-regular fa-question-circle text-gray-400 dark:text-gray-600 text-xs" title="Not indexed"></i>
+                        @endif
+                    </td>
+                </tr>
+            @endfor
+            </tbody>
+        </table>
+    @endif
+
+
     @if (siteInfo.isKnown())
         <div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
             <i class="fas fa-chart-simple"></i>
@@ -28,7 +28,7 @@ import java.util.concurrent.TimeUnit;
 public class ScreenshotCaptureToolMain {
 
     private static final Logger logger = LoggerFactory.getLogger(ScreenshotCaptureToolMain.class);
-
+    private static final String BROWSERLESS_TOKEN = System.getenv("live-capture.browserless-token");
     public static void main(String[] args) {
         DatabaseModule databaseModule = new DatabaseModule(false);
         var ds = databaseModule.provideConnection();
@@ -107,7 +107,7 @@ public class ScreenshotCaptureToolMain {
         );
 
         var request = HttpRequest.newBuilder()
-                .uri(new URI("http://browserless:3000/screenshot"))
+                .uri(new URI("http://browserless:3000/screenshot?token=" + BROWSERLESS_TOKEN))
                .method("POST", HttpRequest.BodyPublishers.ofString(
                        gson.toJson(requestData)
                ))