Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-10-06 17:32:39 +02:00

Compare commits: deploy-004 ... deploy-005 (12 commits)
Commits in this range (SHA1):

aa9b1244ea
2d17233366
b245cc9f38
6614d05bdf
55aeb03c4a
faa589962f
c7edd6b39f
79da622e3b
3da8337ba6
a32d230f0a
3772bfd387
02a7900d1a
Deleted file: DbDomainStatsExportMultitool.java (nu.marginalia.db)

@@ -1,118 +0,0 @@
package nu.marginalia.db;

import com.zaxxer.hikari.HikariDataSource;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.OptionalInt;

/** Class used in exporting data. This is intended to be used for a brief time
 * and then discarded, not kept around as a service.
 */
public class DbDomainStatsExportMultitool implements AutoCloseable {
    private final Connection connection;
    private final int nodeId;
    private final PreparedStatement knownUrlsQuery;
    private final PreparedStatement visitedUrlsQuery;
    private final PreparedStatement goodUrlsQuery;
    private final PreparedStatement domainNameToId;

    private final PreparedStatement allDomainsQuery;
    private final PreparedStatement crawlQueueDomains;
    private final PreparedStatement indexedDomainsQuery;

    public DbDomainStatsExportMultitool(HikariDataSource dataSource, int nodeId) throws SQLException {
        this.connection = dataSource.getConnection();
        this.nodeId = nodeId;

        knownUrlsQuery = connection.prepareStatement("""
                SELECT KNOWN_URLS
                FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
                ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                WHERE DOMAIN_NAME=?
                """);
        visitedUrlsQuery = connection.prepareStatement("""
                SELECT VISITED_URLS
                FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
                ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                WHERE DOMAIN_NAME=?
                """);
        goodUrlsQuery = connection.prepareStatement("""
                SELECT GOOD_URLS
                FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
                ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                WHERE DOMAIN_NAME=?
                """);
        domainNameToId = connection.prepareStatement("""
                SELECT ID
                FROM EC_DOMAIN
                WHERE DOMAIN_NAME=?
                """);
        allDomainsQuery = connection.prepareStatement("""
                SELECT DOMAIN_NAME
                FROM EC_DOMAIN
                """);
        crawlQueueDomains = connection.prepareStatement("""
                SELECT DOMAIN_NAME
                FROM CRAWL_QUEUE
                """);
        indexedDomainsQuery = connection.prepareStatement("""
                SELECT DOMAIN_NAME
                FROM EC_DOMAIN
                WHERE INDEXED > 0
                """);
    }

    public OptionalInt getVisitedUrls(String domainName) throws SQLException {
        return executeNameToIntQuery(domainName, visitedUrlsQuery);
    }

    public OptionalInt getDomainId(String domainName) throws SQLException {
        return executeNameToIntQuery(domainName, domainNameToId);
    }

    public List<String> getCrawlQueueDomains() throws SQLException {
        return executeListQuery(crawlQueueDomains, 100);
    }
    public List<String> getAllIndexedDomains() throws SQLException {
        return executeListQuery(indexedDomainsQuery, 100_000);
    }

    private OptionalInt executeNameToIntQuery(String domainName, PreparedStatement statement)
            throws SQLException {
        statement.setString(1, domainName);
        var rs = statement.executeQuery();

        if (rs.next()) {
            return OptionalInt.of(rs.getInt(1));
        }

        return OptionalInt.empty();
    }

    private List<String> executeListQuery(PreparedStatement statement, int sizeHint) throws SQLException {
        List<String> ret = new ArrayList<>(sizeHint);

        var rs = statement.executeQuery();

        while (rs.next()) {
            ret.add(rs.getString(1));
        }

        return ret;
    }

    @Override
    public void close() throws SQLException {
        knownUrlsQuery.close();
        goodUrlsQuery.close();
        visitedUrlsQuery.close();
        allDomainsQuery.close();
        crawlQueueDomains.close();
        domainNameToId.close();
        connection.close();
    }
}
DatabaseModule.java:

@@ -89,7 +89,7 @@ public class DatabaseModule extends AbstractModule {
         config.addDataSourceProperty("prepStmtCacheSize", "250");
         config.addDataSourceProperty("prepStmtCacheSqlLimit", "2048");

-        config.setMaximumPoolSize(5);
+        config.setMaximumPoolSize(Integer.getInteger("db.poolSize", 5));
         config.setMinimumIdle(2);

         config.setMaxLifetime(Duration.ofMinutes(9).toMillis());
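This change replaces the hardcoded HikariCP pool size with a lookup of the db.poolSize JVM system property, falling back to 5. A minimal sketch of the standard Integer.getInteger semantics this relies on (demo class name is ours):

    public class PoolSizeDemo {
        public static void main(String[] args) {
            // Unset property: the supplied default wins.
            System.out.println(Integer.getInteger("db.poolSize", 5)); // 5

            // Set (normally via -Ddb.poolSize=12 on the JVM command line):
            System.setProperty("db.poolSize", "12");
            System.out.println(Integer.getInteger("db.poolSize", 5)); // 12

            // Unparsable values also fall back to the default.
            System.setProperty("db.poolSize", "lots");
            System.out.println(Integer.getInteger("db.poolSize", 5)); // 5
        }
    }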
build.gradle:

@@ -29,6 +29,7 @@ dependencies {
     implementation libs.jsoup
     implementation project(':third-party:rssreader')
     implementation libs.opencsv
+    implementation libs.slop
     implementation libs.sqlite
     implementation libs.bundles.slf4j
     implementation libs.commons.lang3
BrowserlessClient.java (nu.marginalia.livecapture):

@@ -15,7 +15,9 @@ import java.util.Map;
 /** Client for local browserless.io API */
 public class BrowserlessClient implements AutoCloseable {

     private static final Logger logger = LoggerFactory.getLogger(BrowserlessClient.class);
+
+    private static final String BROWSERLESS_TOKEN = System.getProperty("live-capture.browserless-token", "BROWSERLESS_TOKEN");

     private final HttpClient httpClient = HttpClient.newBuilder()
             .version(HttpClient.Version.HTTP_1_1)

@@ -36,7 +38,7 @@ public class BrowserlessClient implements AutoCloseable {
         );

         var request = HttpRequest.newBuilder()
-                .uri(browserlessURI.resolve("/content"))
+                .uri(browserlessURI.resolve("/content?token="+BROWSERLESS_TOKEN))
                 .method("POST", HttpRequest.BodyPublishers.ofString(
                         gson.toJson(requestData)
                 ))

@@ -63,7 +65,7 @@ public class BrowserlessClient implements AutoCloseable {
         );

         var request = HttpRequest.newBuilder()
-                .uri(browserlessURI.resolve("/screenshot"))
+                .uri(browserlessURI.resolve("/screenshot?token="+BROWSERLESS_TOKEN))
                 .method("POST", HttpRequest.BodyPublishers.ofString(
                         gson.toJson(requestData)
                 ))
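Both endpoints now append the token as a query parameter. It is read from the live-capture.browserless-token system property, defaulting to the literal string BROWSERLESS_TOKEN, which matches the TOKEN value the test container sets further down. A sketch of the resulting URI, with an assumed base address:

    import java.net.URI;

    public class TokenUriDemo {
        public static void main(String[] args) {
            // The default applies when -Dlive-capture.browserless-token is not given.
            String token = System.getProperty("live-capture.browserless-token", "BROWSERLESS_TOKEN");

            URI browserlessURI = URI.create("http://browserless:3000/"); // assumed base
            System.out.println(browserlessURI.resolve("/content?token=" + token));
            // prints: http://browserless:3000/content?token=BROWSERLESS_TOKEN
        }
    }

Note that the token is concatenated without URL-encoding, so real tokens must avoid characters that are reserved in query strings.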
FeedItem.java (nu.marginalia.rss.model):

@@ -1,6 +1,6 @@
 package nu.marginalia.rss.model;

-import com.apptasticsoftware.rssreader.Item;
+import nu.marginalia.rss.svc.SimpleFeedParser;
 import org.apache.commons.lang3.StringUtils;
 import org.jetbrains.annotations.NotNull;
 import org.jsoup.Jsoup;

@@ -18,37 +18,33 @@ public record FeedItem(String title,
     public static final int MAX_DESC_LENGTH = 255;
     public static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSZ");

-    public static FeedItem fromItem(Item item, boolean keepFragment) {
-        String title = item.getTitle().orElse("");
+    public static FeedItem fromItem(SimpleFeedParser.ItemData item, boolean keepFragment) {
+        String title = item.title();
         String date = getItemDate(item);
         String description = getItemDescription(item);
         String url;

-        if (keepFragment || item.getLink().isEmpty()) {
-            url = item.getLink().orElse("");
+        if (keepFragment) {
+            url = item.url();
         }
         else {
             try {
-                String link = item.getLink().get();
+                String link = item.url();
                 var linkUri = new URI(link);
                 var cleanUri = new URI(linkUri.getScheme(), linkUri.getAuthority(), linkUri.getPath(), linkUri.getQuery(), null);
                 url = cleanUri.toString();
             }
             catch (Exception e) {
                 // fallback to original link if we can't clean it, this is not a very important step
-                url = item.getLink().get();
+                url = item.url();
             }
         }

         return new FeedItem(title, date, description, url);
     }

-    private static String getItemDescription(Item item) {
-        Optional<String> description = item.getDescription();
-        if (description.isEmpty())
-            return "";
-
-        String rawDescription = description.get();
+    private static String getItemDescription(SimpleFeedParser.ItemData item) {
+        String rawDescription = item.description();
         if (rawDescription.indexOf('<') >= 0) {
             rawDescription = Jsoup.parseBodyFragment(rawDescription).text();
         }

@@ -58,15 +54,18 @@ public record FeedItem(String title,

     // e.g. http://fabiensanglard.net/rss.xml does dates like this: 1 Apr 2021 00:00:00 +0000
     private static final DateTimeFormatter extraFormatter = DateTimeFormatter.ofPattern("d MMM yyyy HH:mm:ss Z");
-    private static String getItemDate(Item item) {
+    private static String getItemDate(SimpleFeedParser.ItemData item) {
         Optional<ZonedDateTime> zonedDateTime = Optional.empty();
         try {
             zonedDateTime = item.getPubDateZonedDateTime();
         }
         catch (Exception e) {
-            zonedDateTime = item.getPubDate()
-                    .map(extraFormatter::parse)
-                    .map(ZonedDateTime::from);
+            try {
+                zonedDateTime = Optional.of(ZonedDateTime.from(extraFormatter.parse(item.pubDate())));
+            }
+            catch (Exception e2) {
+                // ignore
+            }
         }

         return zonedDateTime.map(date -> date.format(DATE_FORMAT)).orElse("");
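The date handling now tries the parser's own pubDate resolution first and falls back to the "d MMM yyyy HH:mm:ss Z" pattern inside a nested try/catch, swallowing failures of both attempts. A standalone sketch of the fallback path; unlike the original, the locale is pinned here so the month abbreviation parses regardless of the JVM default:

    import java.time.ZonedDateTime;
    import java.time.format.DateTimeFormatter;
    import java.util.Locale;

    public class DateFallbackDemo {
        private static final DateTimeFormatter extraFormatter =
                DateTimeFormatter.ofPattern("d MMM yyyy HH:mm:ss Z", Locale.ENGLISH);

        public static void main(String[] args) {
            // The format used by e.g. http://fabiensanglard.net/rss.xml
            ZonedDateTime zdt = ZonedDateTime.from(extraFormatter.parse("1 Apr 2021 00:00:00 +0000"));
            System.out.println(zdt); // 2021-04-01T00:00Z
        }
    }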
FeedFetcherService.java (nu.marginalia.rss.svc):

@@ -1,7 +1,5 @@
 package nu.marginalia.rss.svc;

-import com.apptasticsoftware.rssreader.Item;
-import com.apptasticsoftware.rssreader.RssReader;
 import com.google.inject.Inject;
 import com.opencsv.CSVReader;
 import nu.marginalia.WmsaHome;

@@ -20,7 +18,6 @@ import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorage;
 import nu.marginalia.storage.model.FileStorageType;
 import nu.marginalia.util.SimpleBlockingThreadPool;
-import org.apache.commons.io.input.BOMInputStream;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -32,7 +29,6 @@ import java.net.URISyntaxException;
 import java.net.http.HttpClient;
 import java.net.http.HttpRequest;
 import java.net.http.HttpResponse;
-import java.nio.charset.StandardCharsets;
 import java.sql.SQLException;
 import java.time.*;
 import java.time.format.DateTimeFormatter;

@@ -48,8 +44,6 @@ public class FeedFetcherService {
     private static final int MAX_FEED_ITEMS = 10;
     private static final Logger logger = LoggerFactory.getLogger(FeedFetcherService.class);

-    private final RssReader rssReader = new RssReader();
-
     private final FeedDb feedDb;
     private final FileStorageService fileStorageService;
     private final NodeConfigurationService nodeConfigurationService;

@@ -72,17 +66,6 @@ public class FeedFetcherService {
         this.nodeConfigurationService = nodeConfigurationService;
         this.serviceHeartbeat = serviceHeartbeat;
         this.executorClient = executorClient;
-
-
-        // Add support for some alternate date tags for atom
-        rssReader.addItemExtension("issued", this::setDateFallback);
-        rssReader.addItemExtension("created", this::setDateFallback);
-    }
-
-    private void setDateFallback(Item item, String value) {
-        if (item.getPubDate().isEmpty()) {
-            item.setPubDate(value);
-        }
     }

     public enum UpdateMode {

@@ -96,6 +79,7 @@ public class FeedFetcherService {
             throw new IllegalStateException("Already updating feeds, refusing to start another update");
         }

+
         try (FeedDbWriter writer = feedDb.createWriter();
              HttpClient client = HttpClient.newBuilder()
                      .connectTimeout(Duration.ofSeconds(15))

@@ -103,6 +87,7 @@ public class FeedFetcherService {
                      .followRedirects(HttpClient.Redirect.NORMAL)
                      .version(HttpClient.Version.HTTP_2)
                      .build();
+             FeedJournal feedJournal = FeedJournal.create();
             var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Update Rss Feeds")
        ) {
            updating = true;

@@ -155,6 +140,8 @@ public class FeedFetcherService {
                         case FetchResult.Success(String value, String etag) -> {
                             writer.saveEtag(feed.domain(), etag);
                             writer.saveFeed(parseFeed(value, feed));
+
+                            feedJournal.record(feed.feedUrl(), value);
                         }
                         case FetchResult.NotModified() -> {
                             writer.saveEtag(feed.domain(), ifNoneMatchTag);

@@ -367,12 +354,7 @@ public class FeedFetcherService {

     public FeedItems parseFeed(String feedData, FeedDefinition definition) {
         try {
-            feedData = sanitizeEntities(feedData);
+            List<SimpleFeedParser.ItemData> rawItems = SimpleFeedParser.parse(feedData);

-            List<Item> rawItems = rssReader.read(
-                    // Massage the data to maximize the possibility of the flaky XML parser consuming it
-                    new BOMInputStream(new ByteArrayInputStream(feedData.trim().getBytes(StandardCharsets.UTF_8)), false)
-            ).toList();
-
             boolean keepUriFragment = rawItems.size() < 2 || areFragmentsDisparate(rawItems);

@@ -395,33 +377,6 @@ public class FeedFetcherService {
         }
     }

-    private static final Map<String, String> HTML_ENTITIES = Map.of(
-            "&raquo;", "»",
-            "&laquo;", "«",
-            "&mdash;", "--",
-            "&ndash;", "-",
-            "&rsquo;", "'",
-            "&lsquo;", "'",
-            "&quot;", "\"",
-            "&nbsp;", ""
-    );
-
-    /** The XML parser will blow up if you insert HTML entities in the feed XML,
-     * which is unfortunately relatively common. Replace them as far as is possible
-     * with their corresponding characters
-     */
-    static String sanitizeEntities(String feedData) {
-        String result = feedData;
-        for (Map.Entry<String, String> entry : HTML_ENTITIES.entrySet()) {
-            result = result.replace(entry.getKey(), entry.getValue());
-        }
-
-        // Handle lone ampersands not part of a recognized XML entity
-        result = result.replaceAll("&(?!(amp|lt|gt|apos|quot);)", "&amp;");
-
-        return result;
-    }
-
     /** Decide whether to keep URI fragments in the feed items.
      * <p></p>
      * We keep fragments if there are multiple different fragments in the items.

@@ -429,16 +384,16 @@ public class FeedFetcherService {
      * @param items The items to check
      * @return True if we should keep the fragments, false otherwise
      */
-    private boolean areFragmentsDisparate(List<Item> items) {
+    private boolean areFragmentsDisparate(List<SimpleFeedParser.ItemData> items) {
         Set<String> seenFragments = new HashSet<>();

         try {
             for (var item : items) {
-                if (item.getLink().isEmpty()) {
+                if (item.url().isBlank()) {
                     continue;
                 }

-                var link = item.getLink().get();
+                var link = item.url();
                 if (!link.contains("#")) {
                     continue;
                 }
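The removed entity sanitization and BOM/whitespace massaging existed to keep the strict XML parser inside RssReader from aborting on real-world feeds. They can be dropped because the new SimpleFeedParser (added below) goes through Jsoup's lenient XML parser, which recovers from undeclared HTML entities and stray ampersands instead of throwing. A small sketch of that behavior (feed content invented for illustration):

    import org.jsoup.Jsoup;
    import org.jsoup.parser.Parser;

    public class LenientXmlDemo {
        public static void main(String[] args) {
            // A strict XML parser would reject both the bare ampersand and the
            // HTML-only &mdash; entity; Jsoup recovers and keeps the text.
            String feed = "<rss><channel><item>"
                    + "<title>Bed & Breakfast &mdash; B&B</title>"
                    + "</item></channel></rss>";

            var doc = Jsoup.parse(feed, "", Parser.xmlParser());
            System.out.println(doc.select("title").text());
        }
    }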
New file: FeedJournal.java (nu.marginalia.rss.svc)

@@ -0,0 +1,76 @@
package nu.marginalia.rss.svc;

import nu.marginalia.WmsaHome;
import nu.marginalia.slop.SlopTable;
import nu.marginalia.slop.column.string.StringColumn;
import nu.marginalia.slop.desc.StorageType;
import org.apache.commons.io.FileUtils;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.function.BiConsumer;

/** Utility for recording fetched feeds to a journal, useful in debugging feed parser issues.
 */
public interface FeedJournal extends AutoCloseable {
    StringColumn urlColumn = new StringColumn("url");
    StringColumn contentsColumn = new StringColumn("contents", StandardCharsets.UTF_8, StorageType.ZSTD);

    void record(String url, String contents) throws IOException;
    void close() throws IOException;


    static FeedJournal create() throws IOException {
        if (Boolean.getBoolean("feedFetcher.persistJournal")) {
            Path journalPath = WmsaHome.getDataPath().resolve("feed-journal");
            if (Files.isDirectory(journalPath)) {
                FileUtils.deleteDirectory(journalPath.toFile());
            }
            Files.createDirectories(journalPath);
            return new RecordingFeedJournal(journalPath);
        }
        else {
            return new NoOpFeedJournal();
        }
    }

    class NoOpFeedJournal implements FeedJournal {
        @Override
        public void record(String url, String contents) {}

        @Override
        public void close() {}
    }

    class RecordingFeedJournal extends SlopTable implements FeedJournal {

        private final StringColumn.Writer urlWriter;
        private final StringColumn.Writer contentsWriter;

        public RecordingFeedJournal(Path path) throws IOException {
            super(path, SlopTable.getNumPages(path, FeedJournal.urlColumn));

            urlWriter = urlColumn.create(this);
            contentsWriter = contentsColumn.create(this);
        }

        public synchronized void record(String url, String contents) throws IOException {
            urlWriter.put(url);
            contentsWriter.put(contents);
        }
    }

    static void replay(Path journalPath, BiConsumer<String, String> urlAndContent) throws IOException {
        try (SlopTable table = new SlopTable(journalPath)) {
            final StringColumn.Reader urlReader = urlColumn.open(table);
            final StringColumn.Reader contentsReader = contentsColumn.open(table);

            while (urlReader.hasRemaining()) {
                urlAndContent.accept(urlReader.get(), contentsReader.get());
            }
        }
    }
}
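FeedJournal is a no-op unless the JVM is started with -DfeedFetcher.persistJournal=true, in which case each fetched feed is written as a (url, contents) pair into a Slop column table under the data directory, and the static replay method streams the journal back for offline debugging. A hedged usage sketch (the journal path is an assumption based on create() above):

    import java.nio.file.Path;
    import nu.marginalia.rss.svc.FeedJournal;
    import nu.marginalia.rss.svc.SimpleFeedParser;

    public class FeedJournalReplayDemo {
        public static void main(String[] args) throws Exception {
            // Directory written by FeedJournal.create() during a fetch run,
            // i.e. WmsaHome.getDataPath().resolve("feed-journal"):
            Path journalPath = Path.of("/wmsa/data/feed-journal"); // assumed path

            // Re-parse every recorded feed, e.g. to reproduce a parser bug
            // without re-fetching anything over the network.
            FeedJournal.replay(journalPath, (url, contents) -> {
                var items = SimpleFeedParser.parse(contents);
                System.out.println(url + ": " + items.size() + " items");
            });
        }
    }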
New file: SimpleFeedParser.java (nu.marginalia.rss.svc)

@@ -0,0 +1,94 @@
package nu.marginalia.rss.svc;

import com.apptasticsoftware.rssreader.DateTimeParser;
import com.apptasticsoftware.rssreader.util.Default;
import org.jsoup.Jsoup;
import org.jsoup.parser.Parser;

import java.time.ZonedDateTime;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

public class SimpleFeedParser {

    private static final DateTimeParser dateTimeParser = Default.getDateTimeParser();

    public record ItemData (
            String title,
            String description,
            String url,
            String pubDate
    ) {
        public boolean isWellFormed() {
            return title != null && !title.isBlank() &&
                    description != null && !description.isBlank() &&
                    url != null && !url.isBlank() &&
                    pubDate != null && !pubDate.isBlank();
        }

        public Optional<ZonedDateTime> getPubDateZonedDateTime() {
            try {
                return Optional.ofNullable(dateTimeParser.parse(pubDate()));
            }
            catch (Exception e) {
                return Optional.empty();
            }
        }

    }

    public static List<ItemData> parse(String content) {
        var doc = Jsoup.parse(content, Parser.xmlParser());
        List<ItemData> ret = new ArrayList<>();

        doc.select("item, entry").forEach(element -> {
            String link = "";
            String title = "";
            String description = "";
            String pubDate = "";

            for (String attr : List.of("title", "dc:title")) {
                if (!title.isBlank())
                    break;
                var tag = element.getElementsByTag(attr).first();
                if (tag != null) {
                    title = tag.text();
                }
            }

            for (String attr : List.of("title", "summary", "content", "description", "dc:description")) {
                if (!description.isBlank())
                    break;
                var tag = element.getElementsByTag(attr).first();
                if (tag != null) {
                    description = tag.text();
                }
            }

            for (String attr : List.of("pubDate", "published", "updated", "issued", "created", "dc:date")) {
                if (!pubDate.isBlank())
                    break;
                var tag = element.getElementsByTag(attr).first();
                if (tag != null) {
                    pubDate = tag.text();
                }
            }

            for (String attr : List.of("link", "url")) {
                if (!link.isBlank())
                    break;
                var tag = element.getElementsByTag(attr).first();
                if (tag != null) {
                    link = tag.text();
                }
            }

            ret.add(new ItemData(title, description, link, pubDate));
        });


        return ret;
    }

}
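Usage is a single static call: the feed is parsed as a lenient XML DOM, item and entry elements are selected, and for each field a list of RSS and Atom tag spellings is probed in order, first non-blank match wins. A minimal sketch with a synthetic feed (content invented for illustration):

    import nu.marginalia.rss.svc.SimpleFeedParser;

    public class SimpleFeedParserDemo {
        public static void main(String[] args) {
            String rss = """
                    <rss version="2.0"><channel><item>
                      <title>Hello World</title>
                      <link>https://example.com/hello</link>
                      <pubDate>Thu, 1 Apr 2021 00:00:00 +0000</pubDate>
                    </item></channel></rss>
                    """;

            for (var item : SimpleFeedParser.parse(rss)) {
                System.out.println(item.title() + " | " + item.url()
                        + " | " + item.getPubDateZonedDateTime());
            }
        }
    }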
BrowserlessClientTest.java (nu.marginalia.livecapture):

@@ -2,16 +2,21 @@ package nu.marginalia.livecapture
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Tag;
 import org.junit.jupiter.api.Test;
 import org.testcontainers.containers.GenericContainer;
 import org.testcontainers.junit.jupiter.Testcontainers;
 import org.testcontainers.utility.DockerImageName;

 import java.net.URI;
+import java.util.Map;

 @Testcontainers
+@Tag("slow")
 public class BrowserlessClientTest {
-    static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome")).withExposedPorts(3000);
+    static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
+            .withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
+            .withExposedPorts(3000);

     @BeforeAll
     public static void setup() {
Deleted file: TestXmlSanitization.java (nu.marginalia.rss.svc)

@@ -1,50 +0,0 @@
package nu.marginalia.rss.svc;

import com.apptasticsoftware.rssreader.Item;
import com.apptasticsoftware.rssreader.RssReader;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

import java.util.List;
import java.util.Optional;

public class TestXmlSanitization {

    @Test
    public void testPreservedEntities() {
        Assertions.assertEquals("&amp;", FeedFetcherService.sanitizeEntities("&amp;"));
        Assertions.assertEquals("&lt;", FeedFetcherService.sanitizeEntities("&lt;"));
        Assertions.assertEquals("&gt;", FeedFetcherService.sanitizeEntities("&gt;"));
        Assertions.assertEquals("&apos;", FeedFetcherService.sanitizeEntities("&apos;"));
    }

    @Test
    public void testNlnetTitleTag() {
        // The NLnet atom feed puts HTML tags in the entry/title tags, which breaks the vanilla RssReader code

        // Verify we're able to consume and strip out the HTML tags
        RssReader r = new RssReader();

        List<Item> items = r.read(ClassLoader.getSystemResourceAsStream("nlnet.atom")).toList();

        Assertions.assertEquals(1, items.size());
        for (var item : items) {
            Assertions.assertEquals(Optional.of("50 Free and Open Source Projects Selected for NGI Zero grants"), item.getTitle());
        }
    }

    @Test
    public void testStrayAmpersand() {
        Assertions.assertEquals("Bed &amp; Breakfast", FeedFetcherService.sanitizeEntities("Bed & Breakfast"));
    }

    @Test
    public void testTranslatedHtmlEntity() {
        Assertions.assertEquals("Foo -- Bar", FeedFetcherService.sanitizeEntities("Foo &mdash; Bar"));
    }

    @Test
    public void testTranslatedHtmlEntityQuot() {
        Assertions.assertEquals("\"Bob\"", FeedFetcherService.sanitizeEntities("&quot;Bob&quot;"));
    }
}
QueryProtobufCodec.java (nu.marginalia.api.searchquery):

@@ -15,7 +15,10 @@ import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
 import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.model.EdgeUrl;

-import java.util.*;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;

 public class QueryProtobufCodec {

@@ -42,13 +45,27 @@ public class QueryProtobufCodec {
         else
             builder.setQueryStrategy(request.getQueryStrategy());

-        if (query.specs.rankingParams != null && request.getTemporalBias().getBias() != RpcTemporalBias.Bias.NONE) {
+        if (request.getTemporalBias().getBias() != RpcTemporalBias.Bias.NONE) {
+            if (query.specs.rankingParams != null) {
                 builder.setParameters(
                         RpcResultRankingParameters.newBuilder(query.specs.rankingParams)
                                 .setTemporalBias(request.getTemporalBias())
                                 .build()
                 );
+            } else {
+                builder.setParameters(
+                        RpcResultRankingParameters.newBuilder(PrototypeRankingParameters.sensibleDefaults())
+                                .setTemporalBias(request.getTemporalBias())
+                                .build()
+                );
             }
+        } else if (query.specs.rankingParams != null) {
+            builder.setParameters(query.specs.rankingParams);
+        }
+        // else {
+        //   if we have no ranking params, we don't need to set them, the client check and use the default values
+        //   so we don't need to send this huge object over the wire
+        // }

         return builder.build();
     }

@@ -292,7 +309,7 @@ public class QueryProtobufCodec {
                 IndexProtobufCodec.convertSpecLimit(specs.getRank()),
                 specs.getQueryLimits(),
                 QueryStrategy.valueOf(specs.getQueryStrategy()),
-                Objects.requireNonNullElseGet(specs.getParameters(), PrototypeRankingParameters::sensibleDefaults)
+                specs.hasParameters() ? specs.getParameters() : null
         );
     }
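The Objects.requireNonNullElseGet pattern the last hunk removes was a latent bug: protobuf-java getters for message fields never return null, an unset field yields the empty default instance, so the fallback could never fire and callers could not tell "unset" apart from "empty". The generated hasParameters() accessor is the correct presence check. An illustrative fragment (the field names follow the diff; the surrounding generated types are assumed):

    // getParameters() returns RpcResultRankingParameters.getDefaultInstance()
    // when the field was never set; it is never null, so a null-check no-ops.
    RpcResultRankingParameters p = specs.getParameters();

    // hasParameters() reports actual field presence, which is what the new
    // code uses to decide between the wire value and a local default or null.
    RpcResultRankingParameters rankingParams =
            specs.hasParameters() ? specs.getParameters() : null;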
SearchSpecification.java:

@@ -5,6 +5,7 @@ import nu.marginalia.api.searchquery.RpcResultRankingParameters;
 import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.query.limit.SpecificationLimit;

+import javax.annotation.Nullable;
 import java.util.List;

 public class SearchSpecification {

@@ -28,6 +29,7 @@ public class SearchSpecification {

     public final QueryStrategy queryStrategy;

+    @Nullable
     public final RpcResultRankingParameters rankingParams;

     public SearchSpecification(SearchQuery query,

@@ -40,7 +42,7 @@ public class SearchSpecification {
                                SpecificationLimit rank,
                                RpcQueryLimits queryLimits,
                                QueryStrategy queryStrategy,
-                               RpcResultRankingParameters rankingParams)
+                               @Nullable RpcResultRankingParameters rankingParams)
     {
         this.query = query;
         this.domains = domains;
SearchParameters.java:

@@ -13,8 +13,6 @@ import nu.marginalia.index.query.IndexSearchBudget;
 import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.searchset.SearchSet;

-import java.util.Objects;
-
 import static nu.marginalia.api.searchquery.IndexProtobufCodec.convertSpecLimit;

 public class SearchParameters {

@@ -88,7 +86,7 @@ public class SearchParameters {
         compiledQuery = CompiledQueryParser.parse(this.query.compiledQuery);
         compiledQueryIds = compiledQuery.mapToLong(SearchTermsUtil::getWordId);

-        rankingParams = Objects.requireNonNullElseGet(request.getParameters(), PrototypeRankingParameters::sensibleDefaults);
+        rankingParams = request.hasParameters() ? request.getParameters() : PrototypeRankingParameters.sensibleDefaults();
     }
SearchQueryService.java:

@@ -37,7 +37,7 @@ public class SearchQueryService {
                                 @QueryParam String profile,
                                 @QueryParam String js,
                                 @QueryParam String recent,
-                                @QueryParam String title,
+                                @QueryParam String searchTitle,
                                 @QueryParam String adtech,
                                 @QueryParam Integer page
     ) {

@@ -47,7 +47,7 @@ public class SearchQueryService {
                 SearchProfile.getSearchProfile(profile),
                 SearchJsParameter.parse(js),
                 SearchRecentParameter.parse(recent),
-                SearchTitleParameter.parse(title),
+                SearchTitleParameter.parse(searchTitle),
                 SearchAdtechParameter.parse(adtech),
                 false,
                 Objects.requireNonNullElse(page,1));
SearchSiteInfoService.java:

@@ -67,8 +67,12 @@ public class SearchSiteInfoService {
         this.screenshotService = screenshotService;
         this.dataSource = dataSource;
         this.searchSiteSubscriptions = searchSiteSubscriptions;
+
+        Thread.ofPlatform().name("Recently Added Domains Model Updater").start(this::modelUpdater);
     }

+    private volatile SiteOverviewModel cachedOverviewModel = new SiteOverviewModel(List.of());
+
     @GET
     @Path("/site")
     public ModelAndView<?> handleOverview(@QueryParam String domain) {

@@ -77,23 +81,48 @@ public class SearchSiteInfoService {
             return new MapModelAndView("redirect.jte", Map.of("url", "/site/"+domain));
         }

-        List<SiteOverviewModel.DiscoveredDomain> domains = new ArrayList<>();
-
-        try (var conn = dataSource.getConnection();
-             var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, DISCOVER_DATE FROM EC_DOMAIN WHERE NODE_AFFINITY = 0 ORDER BY ID DESC LIMIT 10")) {
-
-            var rs = stmt.executeQuery();
-            while (rs.next()) {
-                domains.add(new SiteOverviewModel.DiscoveredDomain(rs.getString("DOMAIN_NAME"), rs.getString("DISCOVER_DATE")));
-            }
-        }
-        catch (SQLException ex) {
-            throw new RuntimeException();
-        }
-
         return new MapModelAndView("siteinfo/start.jte",
                 Map.of("navbar", NavbarModel.SITEINFO,
-                        "model", new SiteOverviewModel(domains)));
+                        "model", cachedOverviewModel));
+    }
+
+    private void modelUpdater() {
+        while (!Thread.interrupted()) {
+            List<SiteOverviewModel.DiscoveredDomain> domains = new ArrayList<>();
+
+            // This query can be quite expensive, so we can't run it on demand
+            // for every request. Instead, we run it every 15 minutes and cache
+            // the result.
+            try (var conn = dataSource.getConnection();
+                 var stmt = conn.prepareStatement("""
+                         SELECT DOMAIN_NAME, DISCOVER_DATE
+                         FROM EC_DOMAIN
+                         WHERE NODE_AFFINITY = 0
+                         ORDER BY ID DESC
+                         LIMIT 10
+                         """))
+            {
+                var rs = stmt.executeQuery();
+                while (rs.next()) {
+                    domains.add(new SiteOverviewModel.DiscoveredDomain(
+                            rs.getString("DOMAIN_NAME"),
+                            rs.getString("DISCOVER_DATE"))
+                    );
+                }
+            } catch (SQLException ex) {
+                logger.warn("Failed to get recently added domains: {}", ex.getMessage());
+            }
+
+            cachedOverviewModel = new SiteOverviewModel(domains);
+
+            try {
+                TimeUnit.MINUTES.sleep(15);
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+                break;
+            }
+        }
+    }

     public record SiteOverviewModel(List<DiscoveredDomain> domains) {
JTE template (new-design notice):

@@ -34,12 +34,12 @@
 <div class="max-w-7xl mx-auto flex flex-col space-y-4 fill-w">
     <div class="border dark:border-gray-600 dark:bg-gray-800 bg-white rounded p-2 m-4 ">
         <div class="text-slate-700 dark:text-white text-sm p-4">
-            <div class="fas fa-wrench mr-1 text-margeblue dark:text-slate-200"></div>
-            This is the new design and home of Marginalia Search. Migration to the new domain <pre class="inline text-red-800 dark:text-red-100">marginalia-search.com</pre> is currently <em>in progress</em>,
-            so mind that some things may be a bit broken for a day or two. <a href="https://about.marginalia-search.com/article/redesign/" class="underline text-liteblue dark:text-blue-200">Read more</a>.
+            <div class="fas fa-gift mr-1 text-margeblue dark:text-slate-200"></div>
+            This is the new design and home of Marginalia Search.
+            You can read about what this entails <a href="https://about.marginalia-search.com/article/redesign/" class="underline text-liteblue dark:text-blue-200">here</a>.
             <p class="my-4"></p>
-            If you have any issues or feedback regarding this change, please email
-            <a href="mailto:contact@marginalia-search.com" class="underline text-liteblue dark:text-blue-200">contact@marginalia-search.com</a>.
+            The old version of Marginalia Search remains available at
+            <a href="https://old-search.marginalia.nu/" class="underline text-liteblue dark:text-blue-200">https://old-search.marginalia.nu/</a>.
         </div>
     </div>
     <div class="mx-auto flex flex-col sm:flex-row my-4 sm:space-x-2 space-y-2 sm:space-y-0 w-full md:w-auto px-2">
JTE template (site info page): the "Related Subdomains" block is moved from before the unknown-domain notice to after the subscription form.

@@ -1,5 +1,4 @@
 @import nu.marginalia.db.DbDomainQueries
-@import nu.marginalia.model.EdgeDomain
 @import nu.marginalia.search.svc.SearchSiteInfoService
 @import nu.marginalia.search.svc.SearchSiteInfoService.*
 @import nu.marginalia.search.model.UrlDetails

@@ -81,35 +80,6 @@
     @endif


-    @if (!siteInfo.siblingDomains().isEmpty())
-        <div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
-            <i class="fas fa-globe"></i>
-            <span>Related Subdomains</span>
-        </div>
-
-        <table class="min-w-full divide-y divide-gray-200 dark:divide-gray-600 mx-4">
-            <thead>
-            <tr class="bg-gray-50 dark:bg-gray-700">
-                <th scope="col" class="px-2 py-2 text-left text-xs font-medium text-gray-500 dark:text-gray-100 uppercase tracking-wider">Domain Name</th>
-            </tr>
-            </thead>
-            <tbody class="bg-white dark:bg-gray-800 divide-y divide-gray-200 dark:divide-gray-600 text-xs">
-            @for (DbDomainQueries.DomainWithNode sibling : siteInfo.siblingDomains())
-                <tr>
-                    <td class="px-3 py-6 md:py-3 whitespace-nowrap">
-                        <a class="text-liteblue dark:text-blue-200" href="/site/${sibling.domain().toString()}">${sibling.domain().toString()}</a>
-
-                        @if (!sibling.isIndexed())
-                            <i class="ml-1 fa-regular fa-question-circle text-gray-400 dark:text-gray-600 text-xs" title="Not indexed"></i>
-                        @endif
-                    </td>
-                </tr>
-            @endfor
-            </tbody>
-        </table>
-    @endif
-
     @if (siteInfo.domainInformation().isUnknownDomain())
         <div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
             <i class="fa-regular fa-circle-question"></i>

@@ -178,6 +148,36 @@
         </form>
     @endif

+
+    @if (!siteInfo.siblingDomains().isEmpty())
+        <div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
+            <i class="fas fa-globe"></i>
+            <span>Related Subdomains</span>
+        </div>
+
+        <table class="min-w-full divide-y divide-gray-200 dark:divide-gray-600 mx-4">
+            <thead>
+            <tr class="bg-gray-50 dark:bg-gray-700">
+                <th scope="col" class="px-2 py-2 text-left text-xs font-medium text-gray-500 dark:text-gray-100 uppercase tracking-wider">Domain Name</th>
+            </tr>
+            </thead>
+            <tbody class="bg-white dark:bg-gray-800 divide-y divide-gray-200 dark:divide-gray-600 text-xs">
+            @for (DbDomainQueries.DomainWithNode sibling : siteInfo.siblingDomains())
+                <tr>
+                    <td class="px-3 py-6 md:py-3 whitespace-nowrap">
+                        <a class="text-liteblue dark:text-blue-200" href="/site/${sibling.domain().toString()}">${sibling.domain().toString()}</a>
+
+                        @if (!sibling.isIndexed())
+                            <i class="ml-1 fa-regular fa-question-circle text-gray-400 dark:text-gray-600 text-xs" title="Not indexed"></i>
+                        @endif
+                    </td>
+                </tr>
+            @endfor
+            </tbody>
+        </table>
+    @endif
+
+
 @if (siteInfo.isKnown())
     <div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
         <i class="fas fa-chart-simple"></i>
ScreenshotCaptureToolMain.java:

@@ -28,7 +28,7 @@ import java.util.concurrent.TimeUnit;
 public class ScreenshotCaptureToolMain {

     private static final Logger logger = LoggerFactory.getLogger(ScreenshotCaptureToolMain.class);
-
+    private static final String BROWSERLESS_TOKEN = System.getenv("live-capture.browserless-token");
     public static void main(String[] args) {
         DatabaseModule databaseModule = new DatabaseModule(false);
         var ds = databaseModule.provideConnection();

@@ -107,7 +107,7 @@ public class ScreenshotCaptureToolMain {
         );

         var request = HttpRequest.newBuilder()
-                .uri(new URI("http://browserless:3000/screenshot"))
+                .uri(new URI("http://browserless:3000/screenshot?token=" + BROWSERLESS_TOKEN))
                 .method("POST", HttpRequest.BodyPublishers.ofString(
                         gson.toJson(requestData)
                 ))
Deployment token file:

@@ -1,3 +1,4 @@
 ## This is a token file for automatic deployment

+2025-01-08: Deploy executor.
 2025-01-07: Deploy executor.