mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits


21 Commits

Author SHA1 Message Date
Viktor Lofgren
e34230c25b (search) Dyslexia fix 2025-01-12 20:39:59 +01:00
Viktor Lofgren
ef3f175ede (search) Don't clobber the search query URL with default values 2025-01-10 15:57:30 +01:00
Viktor Lofgren
bbe4b5d9fd Revert experimental changes 2025-01-10 15:52:02 +01:00
Viktor Lofgren
c67a635103 (search, experimental) Add a few debugging tracks to the search UI 2025-01-10 15:44:44 +01:00
Viktor Lofgren
20b24133fb (search, experimental) Add a few debugging tracks to the search UI 2025-01-10 15:34:48 +01:00
Viktor Lofgren
f2567677e8 (index-client) Clean up index client code
Improve error handling.  This should be a relatively rare case, but we don't want one bad index partition to blow up the entire query.
2025-01-10 15:17:07 +01:00
Viktor Lofgren
bc2c2061f2 (index-client) Clean up index client code
This makes the rpc stream reception happen in parallel in separate threads, rather than blocking sequentially in the main thread, hopefully giving a slight performance boost.
2025-01-10 15:14:42 +01:00
Viktor Lofgren
1c7f5a31a5 (search) Further reduce the number of db queries by adding more caching to DbDomainQueries. 2025-01-10 14:17:29 +01:00
Viktor Lofgren
59a8ea60f7 (search) Further reduce the number of db queries by adding more caching to DbDomainQueries. 2025-01-10 14:15:22 +01:00
Viktor Lofgren
aa9b1244ea (search) Reduce the number of db queries a bit by caching data that doesn't change too often 2025-01-10 13:56:04 +01:00
Viktor Lofgren
2d17233366 (search) Reduce the number of db queries a bit by caching data that doesn't change too often 2025-01-10 13:53:56 +01:00
Viktor Lofgren
b245cc9f38 (search) Reduce the number of db queries a bit by caching data that doesn't change too often 2025-01-10 13:46:19 +01:00
Viktor Lofgren
6614d05bdf (db) Make db pool size configurable 2025-01-09 20:20:51 +01:00
Viktor Lofgren
55aeb03c4a (feeds) Replace rssreader based parsing with a custom jsoup based rss parser
This solves some issues with the rssreader based parser, which was very picky about the XML being valid.  Jsoup is much more lenient when parsing malformed XML.
2025-01-09 18:29:55 +01:00
Viktor Lofgren
faa589962f (live-capture) Browserless now requires a token 2025-01-09 14:51:11 +01:00
Viktor Lofgren
c7edd6b39f (live-capture) Browserless now requires a token 2025-01-09 14:46:05 +01:00
Viktor Lofgren
79da622e3b (search) Update front page with new banner about move 2025-01-08 21:38:19 +01:00
Viktor Lofgren
3da8337ba6 (feeds) Add system property for exporting fetched feeds to a slop table for debugging 2025-01-08 20:49:16 +01:00
Viktor Lofgren
a32d230f0a (special) Trigger deployment 2025-01-08 20:07:54 +01:00
Viktor Lofgren
3772bfd387 (query) Fix handling of optional ranking parameters 2025-01-08 17:11:22 +01:00
Viktor Lofgren
02a7900d1a (search) Correct search-in-title toggle in search UI 2025-01-08 16:51:10 +01:00
24 changed files with 423 additions and 391 deletions

View File

@@ -20,7 +20,10 @@ public class DbDomainQueries {
     private final HikariDataSource dataSource;

     private static final Logger logger = LoggerFactory.getLogger(DbDomainQueries.class);

     private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
+    private final Cache<Integer, EdgeDomain> domainNameCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
+    private final Cache<String, List<DomainWithNode>> siblingsCache = CacheBuilder.newBuilder().maximumSize(10_000).build();

     @Inject
     public DbDomainQueries(HikariDataSource dataSource)
@@ -30,16 +33,21 @@ public class DbDomainQueries {

     public Integer getDomainId(EdgeDomain domain) throws NoSuchElementException {
-        try (var connection = dataSource.getConnection()) {
+        try {
             return domainIdCache.get(domain, () -> {
-                try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
+                try (var connection = dataSource.getConnection();
+                     var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
                     stmt.setString(1, domain.toString());

                     var rsp = stmt.executeQuery();
                     if (rsp.next()) {
                         return rsp.getInt(1);
                     }
                 }
+                catch (SQLException ex) {
+                    throw new RuntimeException(ex);
+                }

                 throw new NoSuchElementException();
             });
         }
@@ -49,9 +57,6 @@ public class DbDomainQueries {
         catch (ExecutionException ex) {
             throw new RuntimeException(ex.getCause());
         }
-        catch (SQLException ex) {
-            throw new RuntimeException(ex);
-        }
     }

     public OptionalInt tryGetDomainId(EdgeDomain domain) {
@@ -84,47 +89,55 @@ public class DbDomainQueries {
     }

     public Optional<EdgeDomain> getDomain(int id) {
-        try (var connection = dataSource.getConnection()) {
+        EdgeDomain existing = domainNameCache.getIfPresent(id);
+        if (existing != null) {
+            return Optional.of(existing);
+        }
+
+        try (var connection = dataSource.getConnection()) {
             try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
                 stmt.setInt(1, id);

                 var rsp = stmt.executeQuery();
                 if (rsp.next()) {
-                    return Optional.of(new EdgeDomain(rsp.getString(1)));
+                    var val = new EdgeDomain(rsp.getString(1));
+                    domainNameCache.put(id, val);
+                    return Optional.of(val);
                 }
                 return Optional.empty();
             }
         }
+        catch (UncheckedExecutionException ex) {
+            throw new RuntimeException(ex.getCause());
+        }
         catch (SQLException ex) {
             throw new RuntimeException(ex);
         }
     }

-    public List<DomainWithNode> otherSubdomains(EdgeDomain domain, int cnt) {
-        List<DomainWithNode> ret = new ArrayList<>();
-
-        try (var conn = dataSource.getConnection();
-             var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_TOP = ? LIMIT ?")) {
-            stmt.setString(1, domain.topDomain);
-            stmt.setInt(2, cnt);
-            var rs = stmt.executeQuery();
-            while (rs.next()) {
-                var sibling = new EdgeDomain(rs.getString(1));
-
-                if (sibling.equals(domain))
-                    continue;
-
-                ret.add(new DomainWithNode(sibling, rs.getInt(2)));
-            }
-        } catch (SQLException e) {
-            logger.error("Failed to get domain neighbors");
-        }
-
-        return ret;
+    public List<DomainWithNode> otherSubdomains(EdgeDomain domain, int cnt) throws ExecutionException {
+        String topDomain = domain.topDomain;
+
+        return siblingsCache.get(topDomain, () -> {
+            List<DomainWithNode> ret = new ArrayList<>();
+
+            try (var conn = dataSource.getConnection();
+                 var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_TOP = ? LIMIT ?")) {
+                stmt.setString(1, topDomain);
+                stmt.setInt(2, cnt);
+                var rs = stmt.executeQuery();
+                while (rs.next()) {
+                    var sibling = new EdgeDomain(rs.getString(1));
+
+                    if (sibling.equals(domain))
+                        continue;
+
+                    ret.add(new DomainWithNode(sibling, rs.getInt(2)));
+                }
+            } catch (SQLException e) {
+                logger.error("Failed to get domain neighbors");
+            }
+
+            return ret;
+        });
     }

     public record DomainWithNode (EdgeDomain domain, int nodeAffinity) {
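
A note on the caching pattern above: Guava's Cache.get(key, loader) computes the value on a miss, stores it, and wraps any loader failure in a checked ExecutionException — which is why otherSubdomains now declares throws ExecutionException and the SQL error handling moved inside the loader lambda. A minimal sketch of the idiom, assuming Guava on the classpath (class and names below are illustrative, not part of the commit):

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import java.util.concurrent.ExecutionException;

public class CacheSketch {
    private static final Cache<String, Integer> cache = CacheBuilder.newBuilder()
            .maximumSize(10_000)
            .build();

    public static void main(String[] args) throws ExecutionException {
        // First call runs the loader and stores the result; the second is a pure cache hit.
        System.out.println(cache.get("marginalia.nu", () -> expensiveQuery("marginalia.nu")));
        System.out.println(cache.get("marginalia.nu", () -> expensiveQuery("marginalia.nu")));
    }

    private static int expensiveQuery(String key) {
        System.out.println("loader ran for " + key); // prints only once
        return key.length(); // stand-in for a database round trip
    }
}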

View File

@@ -1,118 +0,0 @@
package nu.marginalia.db;

import com.zaxxer.hikari.HikariDataSource;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.OptionalInt;

/** Class used in exporting data.  This is intended to be used for a brief time
 *  and then discarded, not kept around as a service.
 */
public class DbDomainStatsExportMultitool implements AutoCloseable {
    private final Connection connection;
    private final int nodeId;
    private final PreparedStatement knownUrlsQuery;
    private final PreparedStatement visitedUrlsQuery;
    private final PreparedStatement goodUrlsQuery;
    private final PreparedStatement domainNameToId;
    private final PreparedStatement allDomainsQuery;
    private final PreparedStatement crawlQueueDomains;
    private final PreparedStatement indexedDomainsQuery;

    public DbDomainStatsExportMultitool(HikariDataSource dataSource, int nodeId) throws SQLException {
        this.connection = dataSource.getConnection();
        this.nodeId = nodeId;

        knownUrlsQuery = connection.prepareStatement("""
                SELECT KNOWN_URLS
                FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
                ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                WHERE DOMAIN_NAME=?
                """);
        visitedUrlsQuery = connection.prepareStatement("""
                SELECT VISITED_URLS
                FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
                ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                WHERE DOMAIN_NAME=?
                """);
        goodUrlsQuery = connection.prepareStatement("""
                SELECT GOOD_URLS
                FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
                ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                WHERE DOMAIN_NAME=?
                """);
        domainNameToId = connection.prepareStatement("""
                SELECT ID
                FROM EC_DOMAIN
                WHERE DOMAIN_NAME=?
                """);
        allDomainsQuery = connection.prepareStatement("""
                SELECT DOMAIN_NAME
                FROM EC_DOMAIN
                """);
        crawlQueueDomains = connection.prepareStatement("""
                SELECT DOMAIN_NAME
                FROM CRAWL_QUEUE
                """);
        indexedDomainsQuery = connection.prepareStatement("""
                SELECT DOMAIN_NAME
                FROM EC_DOMAIN
                WHERE INDEXED > 0
                """);
    }

    public OptionalInt getVisitedUrls(String domainName) throws SQLException {
        return executeNameToIntQuery(domainName, visitedUrlsQuery);
    }

    public OptionalInt getDomainId(String domainName) throws SQLException {
        return executeNameToIntQuery(domainName, domainNameToId);
    }

    public List<String> getCrawlQueueDomains() throws SQLException {
        return executeListQuery(crawlQueueDomains, 100);
    }

    public List<String> getAllIndexedDomains() throws SQLException {
        return executeListQuery(indexedDomainsQuery, 100_000);
    }

    private OptionalInt executeNameToIntQuery(String domainName, PreparedStatement statement)
            throws SQLException {
        statement.setString(1, domainName);
        var rs = statement.executeQuery();

        if (rs.next()) {
            return OptionalInt.of(rs.getInt(1));
        }

        return OptionalInt.empty();
    }

    private List<String> executeListQuery(PreparedStatement statement, int sizeHint) throws SQLException {
        List<String> ret = new ArrayList<>(sizeHint);

        var rs = statement.executeQuery();

        while (rs.next()) {
            ret.add(rs.getString(1));
        }

        return ret;
    }

    @Override
    public void close() throws SQLException {
        knownUrlsQuery.close();
        goodUrlsQuery.close();
        visitedUrlsQuery.close();
        allDomainsQuery.close();
        crawlQueueDomains.close();
        domainNameToId.close();
        connection.close();
    }
}

View File

@@ -89,7 +89,7 @@ public class DatabaseModule extends AbstractModule {
         config.addDataSourceProperty("prepStmtCacheSize", "250");
         config.addDataSourceProperty("prepStmtCacheSqlLimit", "2048");

-        config.setMaximumPoolSize(5);
+        config.setMaximumPoolSize(Integer.getInteger("db.poolSize", 5));
         config.setMinimumIdle(2);

         config.setMaxLifetime(Duration.ofMinutes(9).toMillis());
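
Integer.getInteger reads a JVM system property, so the pool size can now be tuned per service at launch without a rebuild. A small sketch of the behavior (class name illustrative):

public class PoolSizeDemo {
    public static void main(String[] args) {
        // Passing -Ddb.poolSize=20 on the JVM command line yields 20;
        // when the property is absent or not a valid integer, the default 5 applies.
        int poolSize = Integer.getInteger("db.poolSize", 5);
        System.out.println("Pool size: " + poolSize);
    }
}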

View File

@@ -29,6 +29,7 @@ dependencies {
     implementation libs.jsoup
     implementation project(':third-party:rssreader')
     implementation libs.opencsv
+    implementation libs.slop
     implementation libs.sqlite
     implementation libs.bundles.slf4j
     implementation libs.commons.lang3

View File

@@ -15,7 +15,9 @@ import java.util.Map;

 /** Client for local browserless.io API */
 public class BrowserlessClient implements AutoCloseable {

     private static final Logger logger = LoggerFactory.getLogger(BrowserlessClient.class);
+    private static final String BROWSERLESS_TOKEN = System.getProperty("live-capture.browserless-token", "BROWSERLESS_TOKEN");

     private final HttpClient httpClient = HttpClient.newBuilder()
             .version(HttpClient.Version.HTTP_1_1)
@@ -36,7 +38,7 @@ public class BrowserlessClient implements AutoCloseable {
         );

         var request = HttpRequest.newBuilder()
-                .uri(browserlessURI.resolve("/content"))
+                .uri(browserlessURI.resolve("/content?token="+BROWSERLESS_TOKEN))
                 .method("POST", HttpRequest.BodyPublishers.ofString(
                         gson.toJson(requestData)
                 ))
@@ -63,7 +65,7 @@ public class BrowserlessClient implements AutoCloseable {
         );

         var request = HttpRequest.newBuilder()
-                .uri(browserlessURI.resolve("/screenshot"))
+                .uri(browserlessURI.resolve("/screenshot?token="+BROWSERLESS_TOKEN))
                 .method("POST", HttpRequest.BodyPublishers.ofString(
                         gson.toJson(requestData)
                 ))

View File

@@ -1,6 +1,6 @@
 package nu.marginalia.rss.model;

-import com.apptasticsoftware.rssreader.Item;
+import nu.marginalia.rss.svc.SimpleFeedParser;
 import org.apache.commons.lang3.StringUtils;
 import org.jetbrains.annotations.NotNull;
 import org.jsoup.Jsoup;
@@ -18,37 +18,33 @@ public record FeedItem(String title,
     public static final int MAX_DESC_LENGTH = 255;
     public static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSZ");

-    public static FeedItem fromItem(Item item, boolean keepFragment) {
-        String title = item.getTitle().orElse("");
+    public static FeedItem fromItem(SimpleFeedParser.ItemData item, boolean keepFragment) {
+        String title = item.title();
         String date = getItemDate(item);
         String description = getItemDescription(item);
         String url;

-        if (keepFragment || item.getLink().isEmpty()) {
-            url = item.getLink().orElse("");
+        if (keepFragment) {
+            url = item.url();
         }
         else {
             try {
-                String link = item.getLink().get();
+                String link = item.url();
                 var linkUri = new URI(link);
                 var cleanUri = new URI(linkUri.getScheme(), linkUri.getAuthority(), linkUri.getPath(), linkUri.getQuery(), null);
                 url = cleanUri.toString();
             }
             catch (Exception e) {
                 // fallback to original link if we can't clean it, this is not a very important step
-                url = item.getLink().get();
+                url = item.url();
             }
         }

         return new FeedItem(title, date, description, url);
     }

-    private static String getItemDescription(Item item) {
-        Optional<String> description = item.getDescription();
-        if (description.isEmpty())
-            return "";
-
-        String rawDescription = description.get();
+    private static String getItemDescription(SimpleFeedParser.ItemData item) {
+        String rawDescription = item.description();
         if (rawDescription.indexOf('<') >= 0) {
             rawDescription = Jsoup.parseBodyFragment(rawDescription).text();
         }
@@ -58,15 +54,18 @@ public record FeedItem(String title,
     // e.g. http://fabiensanglard.net/rss.xml does dates like this:  1 Apr 2021 00:00:00 +0000
     private static final DateTimeFormatter extraFormatter = DateTimeFormatter.ofPattern("d MMM yyyy HH:mm:ss Z");

-    private static String getItemDate(Item item) {
+    private static String getItemDate(SimpleFeedParser.ItemData item) {
         Optional<ZonedDateTime> zonedDateTime = Optional.empty();
         try {
             zonedDateTime = item.getPubDateZonedDateTime();
         }
         catch (Exception e) {
-            zonedDateTime = item.getPubDate()
-                    .map(extraFormatter::parse)
-                    .map(ZonedDateTime::from);
+            try {
+                zonedDateTime = Optional.of(ZonedDateTime.from(extraFormatter.parse(item.pubDate())));
+            }
+            catch (Exception e2) {
+                // ignore
+            }
         }

         return zonedDateTime.map(date -> date.format(DATE_FORMAT)).orElse("");
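
The extraFormatter fallback above targets non-RFC dates like the fabiensanglard.net example in the code comment. A quick standalone check of that parse path (pinning Locale.ENGLISH explicitly, which the production code leaves to the default locale; class name illustrative):

import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Locale;

public class ExtraDateFormatDemo {
    public static void main(String[] args) {
        DateTimeFormatter extraFormatter =
                DateTimeFormatter.ofPattern("d MMM yyyy HH:mm:ss Z", Locale.ENGLISH);

        // The style of date the comment above attributes to fabiensanglard.net's feed.
        ZonedDateTime parsed = ZonedDateTime.from(extraFormatter.parse("1 Apr 2021 00:00:00 +0000"));
        System.out.println(parsed); // 2021-04-01T00:00Z
    }
}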

View File

@@ -1,7 +1,5 @@
 package nu.marginalia.rss.svc;

-import com.apptasticsoftware.rssreader.Item;
-import com.apptasticsoftware.rssreader.RssReader;
 import com.google.inject.Inject;
 import com.opencsv.CSVReader;
 import nu.marginalia.WmsaHome;
@@ -20,7 +18,6 @@ import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorage;
 import nu.marginalia.storage.model.FileStorageType;
 import nu.marginalia.util.SimpleBlockingThreadPool;
-import org.apache.commons.io.input.BOMInputStream;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -32,7 +29,6 @@ import java.net.URISyntaxException;
 import java.net.http.HttpClient;
 import java.net.http.HttpRequest;
 import java.net.http.HttpResponse;
-import java.nio.charset.StandardCharsets;
 import java.sql.SQLException;
 import java.time.*;
 import java.time.format.DateTimeFormatter;
@@ -48,8 +44,6 @@ public class FeedFetcherService {
     private static final int MAX_FEED_ITEMS = 10;
     private static final Logger logger = LoggerFactory.getLogger(FeedFetcherService.class);

-    private final RssReader rssReader = new RssReader();
-
     private final FeedDb feedDb;
     private final FileStorageService fileStorageService;
     private final NodeConfigurationService nodeConfigurationService;
@@ -72,17 +66,6 @@ public class FeedFetcherService {
         this.nodeConfigurationService = nodeConfigurationService;
         this.serviceHeartbeat = serviceHeartbeat;
         this.executorClient = executorClient;
-
-        // Add support for some alternate date tags for atom
-        rssReader.addItemExtension("issued", this::setDateFallback);
-        rssReader.addItemExtension("created", this::setDateFallback);
-    }
-
-    private void setDateFallback(Item item, String value) {
-        if (item.getPubDate().isEmpty()) {
-            item.setPubDate(value);
-        }
     }

     public enum UpdateMode {
@@ -96,6 +79,7 @@ public class FeedFetcherService {
             throw new IllegalStateException("Already updating feeds, refusing to start another update");
         }

         try (FeedDbWriter writer = feedDb.createWriter();
              HttpClient client = HttpClient.newBuilder()
                      .connectTimeout(Duration.ofSeconds(15))
@@ -103,6 +87,7 @@ public class FeedFetcherService {
                      .followRedirects(HttpClient.Redirect.NORMAL)
                      .version(HttpClient.Version.HTTP_2)
                      .build();
+             FeedJournal feedJournal = FeedJournal.create();
              var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Update Rss Feeds")
         ) {
             updating = true;
@@ -155,6 +140,8 @@ public class FeedFetcherService {
                         case FetchResult.Success(String value, String etag) -> {
                             writer.saveEtag(feed.domain(), etag);
                             writer.saveFeed(parseFeed(value, feed));
+
+                            feedJournal.record(feed.feedUrl(), value);
                         }
                         case FetchResult.NotModified() -> {
                             writer.saveEtag(feed.domain(), ifNoneMatchTag);
@@ -367,12 +354,7 @@ public class FeedFetcherService {
     public FeedItems parseFeed(String feedData, FeedDefinition definition) {
         try {
-            feedData = sanitizeEntities(feedData);
-
-            List<Item> rawItems = rssReader.read(
-                    // Massage the data to maximize the possibility of the flaky XML parser consuming it
-                    new BOMInputStream(new ByteArrayInputStream(feedData.trim().getBytes(StandardCharsets.UTF_8)), false)
-            ).toList();
+            List<SimpleFeedParser.ItemData> rawItems = SimpleFeedParser.parse(feedData);

             boolean keepUriFragment = rawItems.size() < 2 || areFragmentsDisparate(rawItems);
@@ -395,33 +377,6 @@ public class FeedFetcherService {
         }
     }

-    private static final Map<String, String> HTML_ENTITIES = Map.of(
-            "&raquo;", "»",
-            "&laquo;", "«",
-            "&mdash;", "--",
-            "&ndash;", "-",
-            "&rsquo;", "'",
-            "&lsquo;", "'",
-            "&quot;", "\"",
-            "&nbsp;", ""
-    );
-
-    /** The XML parser will blow up if you insert HTML entities in the feed XML,
-     *  which is unfortunately relatively common.  Replace them as far as is possible
-     *  with their corresponding characters
-     */
-    static String sanitizeEntities(String feedData) {
-        String result = feedData;
-        for (Map.Entry<String, String> entry : HTML_ENTITIES.entrySet()) {
-            result = result.replace(entry.getKey(), entry.getValue());
-        }
-
-        // Handle lone ampersands not part of a recognized XML entity
-        result = result.replaceAll("&(?!(amp|lt|gt|apos|quot);)", "&amp;");
-
-        return result;
-    }
-
     /** Decide whether to keep URI fragments in the feed items.
      * <p></p>
      * We keep fragments if there are multiple different fragments in the items.
@@ -429,16 +384,16 @@ public class FeedFetcherService {
      * @param items The items to check
      * @return True if we should keep the fragments, false otherwise
      */
-    private boolean areFragmentsDisparate(List<Item> items) {
+    private boolean areFragmentsDisparate(List<SimpleFeedParser.ItemData> items) {
         Set<String> seenFragments = new HashSet<>();

         try {
             for (var item : items) {
-                if (item.getLink().isEmpty()) {
+                if (item.url().isBlank()) {
                     continue;
                 }
-                var link = item.getLink().get();
+                var link = item.url();
                 if (!link.contains("#")) {
                     continue;
                 }

View File

@@ -0,0 +1,76 @@
package nu.marginalia.rss.svc;

import nu.marginalia.WmsaHome;
import nu.marginalia.slop.SlopTable;
import nu.marginalia.slop.column.string.StringColumn;
import nu.marginalia.slop.desc.StorageType;
import org.apache.commons.io.FileUtils;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.function.BiConsumer;

/** Utility for recording fetched feeds to a journal, useful in debugging feed parser issues.
 */
public interface FeedJournal extends AutoCloseable {
    StringColumn urlColumn = new StringColumn("url");
    StringColumn contentsColumn = new StringColumn("contents", StandardCharsets.UTF_8, StorageType.ZSTD);

    void record(String url, String contents) throws IOException;
    void close() throws IOException;

    static FeedJournal create() throws IOException {
        if (Boolean.getBoolean("feedFetcher.persistJournal")) {
            Path journalPath = WmsaHome.getDataPath().resolve("feed-journal");
            if (Files.isDirectory(journalPath)) {
                FileUtils.deleteDirectory(journalPath.toFile());
            }
            Files.createDirectories(journalPath);
            return new RecordingFeedJournal(journalPath);
        }
        else {
            return new NoOpFeedJournal();
        }
    }

    class NoOpFeedJournal implements FeedJournal {
        @Override
        public void record(String url, String contents) {}

        @Override
        public void close() {}
    }

    class RecordingFeedJournal extends SlopTable implements FeedJournal {
        private final StringColumn.Writer urlWriter;
        private final StringColumn.Writer contentsWriter;

        public RecordingFeedJournal(Path path) throws IOException {
            super(path, SlopTable.getNumPages(path, FeedJournal.urlColumn));

            urlWriter = urlColumn.create(this);
            contentsWriter = contentsColumn.create(this);
        }

        public synchronized void record(String url, String contents) throws IOException {
            urlWriter.put(url);
            contentsWriter.put(contents);
        }
    }

    static void replay(Path journalPath, BiConsumer<String, String> urlAndContent) throws IOException {
        try (SlopTable table = new SlopTable(journalPath)) {
            final StringColumn.Reader urlReader = urlColumn.open(table);
            final StringColumn.Reader contentsReader = contentsColumn.open(table);

            while (urlReader.hasRemaining()) {
                urlAndContent.accept(urlReader.get(), contentsReader.get());
            }
        }
    }
}
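
Per create() above, the journal is opt-in via the feedFetcher.persistJournal system property and written under the instance's data path. A hypothetical replay harness for reproducing parser issues from a recorded journal might look like this (same package as FeedJournal assumed; the path is illustrative):

import java.nio.file.Path;

public class FeedJournalReplayDemo {
    public static void main(String[] args) throws Exception {
        // Replay a previously recorded journal through the new parser and
        // report how many items each recorded feed yields.
        Path journalPath = Path.of("/tmp/feed-journal"); // illustrative path
        FeedJournal.replay(journalPath, (url, contents) -> {
            int items = SimpleFeedParser.parse(contents).size();
            System.out.println(url + " -> " + items + " items");
        });
    }
}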

View File

@@ -0,0 +1,94 @@
package nu.marginalia.rss.svc;

import com.apptasticsoftware.rssreader.DateTimeParser;
import com.apptasticsoftware.rssreader.util.Default;
import org.jsoup.Jsoup;
import org.jsoup.parser.Parser;

import java.time.ZonedDateTime;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

public class SimpleFeedParser {

    private static final DateTimeParser dateTimeParser = Default.getDateTimeParser();

    public record ItemData (
            String title,
            String description,
            String url,
            String pubDate
    ) {
        public boolean isWellFormed() {
            return title != null && !title.isBlank() &&
                    description != null && !description.isBlank() &&
                    url != null && !url.isBlank() &&
                    pubDate != null && !pubDate.isBlank();
        }

        public Optional<ZonedDateTime> getPubDateZonedDateTime() {
            try {
                return Optional.ofNullable(dateTimeParser.parse(pubDate()));
            }
            catch (Exception e) {
                return Optional.empty();
            }
        }
    }

    public static List<ItemData> parse(String content) {
        var doc = Jsoup.parse(content, Parser.xmlParser());
        List<ItemData> ret = new ArrayList<>();

        doc.select("item, entry").forEach(element -> {
            String link = "";
            String title = "";
            String description = "";
            String pubDate = "";

            for (String attr : List.of("title", "dc:title")) {
                if (!title.isBlank())
                    break;
                var tag = element.getElementsByTag(attr).first();
                if (tag != null) {
                    title = tag.text();
                }
            }

            for (String attr : List.of("title", "summary", "content", "description", "dc:description")) {
                if (!description.isBlank())
                    break;
                var tag = element.getElementsByTag(attr).first();
                if (tag != null) {
                    description = tag.text();
                }
            }

            for (String attr : List.of("pubDate", "published", "updated", "issued", "created", "dc:date")) {
                if (!pubDate.isBlank())
                    break;
                var tag = element.getElementsByTag(attr).first();
                if (tag != null) {
                    pubDate = tag.text();
                }
            }

            for (String attr : List.of("link", "url")) {
                if (!link.isBlank())
                    break;
                var tag = element.getElementsByTag(attr).first();
                if (tag != null) {
                    link = tag.text();
                }
            }

            ret.add(new ItemData(title, description, link, pubDate));
        });

        return ret;
    }
}
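
Since the point of switching to jsoup is tolerance of malformed XML, a quick way to see the difference is to feed the parser input a strict XML parser would reject — here a stray ampersand and a missing closing </item> tag. A minimal sketch (demo class is illustrative, same package as SimpleFeedParser assumed):

import java.util.List;

public class SimpleFeedParserDemo {
    public static void main(String[] args) {
        // Deliberately sloppy RSS: unescaped ampersand in the title, no closing </item>.
        String feed = """
                <rss><channel>
                <item>
                  <title>Bed & Breakfast</title>
                  <link>https://example.com/post#intro</link>
                  <pubDate>1 Apr 2021 00:00:00 +0000</pubDate>
                </channel></rss>
                """;

        // jsoup's xmlParser() recovers instead of throwing, so the item still comes through.
        List<SimpleFeedParser.ItemData> items = SimpleFeedParser.parse(feed);
        for (var item : items) {
            System.out.println(item.title() + " @ " + item.url());
        }
    }
}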

View File

@@ -2,16 +2,21 @@ package nu.marginalia.livecapture;

 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Tag;
 import org.junit.jupiter.api.Test;
 import org.testcontainers.containers.GenericContainer;
 import org.testcontainers.junit.jupiter.Testcontainers;
 import org.testcontainers.utility.DockerImageName;

 import java.net.URI;
+import java.util.Map;

 @Testcontainers
+@Tag("slow")
 public class BrowserlessClientTest {
-    static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome")).withExposedPorts(3000);
+    static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
+            .withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
+            .withExposedPorts(3000);

     @BeforeAll
     public static void setup() {
View File

@@ -1,50 +0,0 @@
package nu.marginalia.rss.svc;

import com.apptasticsoftware.rssreader.Item;
import com.apptasticsoftware.rssreader.RssReader;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

import java.util.List;
import java.util.Optional;

public class TestXmlSanitization {

    @Test
    public void testPreservedEntities() {
        Assertions.assertEquals("&amp;", FeedFetcherService.sanitizeEntities("&amp;"));
        Assertions.assertEquals("&lt;", FeedFetcherService.sanitizeEntities("&lt;"));
        Assertions.assertEquals("&gt;", FeedFetcherService.sanitizeEntities("&gt;"));
        Assertions.assertEquals("&apos;", FeedFetcherService.sanitizeEntities("&apos;"));
    }

    @Test
    public void testNlnetTitleTag() {
        // The NLnet atom feed puts HTML tags in the entry/title tags, which breaks the vanilla RssReader code
        // Verify we're able to consume and strip out the HTML tags
        RssReader r = new RssReader();
        List<Item> items = r.read(ClassLoader.getSystemResourceAsStream("nlnet.atom")).toList();

        Assertions.assertEquals(1, items.size());
        for (var item : items) {
            Assertions.assertEquals(Optional.of("50 Free and Open Source Projects Selected for NGI Zero grants"), item.getTitle());
        }
    }

    @Test
    public void testStrayAmpersand() {
        Assertions.assertEquals("Bed &amp; Breakfast", FeedFetcherService.sanitizeEntities("Bed & Breakfast"));
    }

    @Test
    public void testTranslatedHtmlEntity() {
        Assertions.assertEquals("Foo -- Bar", FeedFetcherService.sanitizeEntities("Foo &mdash; Bar"));
    }

    @Test
    public void testTranslatedHtmlEntityQuot() {
        Assertions.assertEquals("\"Bob\"", FeedFetcherService.sanitizeEntities("&quot;Bob&quot;"));
    }
}

View File

@@ -15,7 +15,10 @@ import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
 import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.model.EdgeUrl;

-import java.util.*;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;

 public class QueryProtobufCodec {
@@ -42,13 +45,27 @@ public class QueryProtobufCodec {
         else
             builder.setQueryStrategy(request.getQueryStrategy());

-        if (query.specs.rankingParams != null && request.getTemporalBias().getBias() != RpcTemporalBias.Bias.NONE) {
-            builder.setParameters(
-                    RpcResultRankingParameters.newBuilder(query.specs.rankingParams)
-                            .setTemporalBias(request.getTemporalBias())
-                            .build()
-            );
+        if (request.getTemporalBias().getBias() != RpcTemporalBias.Bias.NONE) {
+            if (query.specs.rankingParams != null) {
+                builder.setParameters(
+                        RpcResultRankingParameters.newBuilder(query.specs.rankingParams)
+                                .setTemporalBias(request.getTemporalBias())
+                                .build()
+                );
+            } else {
+                builder.setParameters(
+                        RpcResultRankingParameters.newBuilder(PrototypeRankingParameters.sensibleDefaults())
+                                .setTemporalBias(request.getTemporalBias())
+                                .build()
+                );
+            }
+        } else if (query.specs.rankingParams != null) {
+            builder.setParameters(query.specs.rankingParams);
         }
+        // else {
+        //     if we have no ranking params, we don't need to set them; the client will check and use the default values,
+        //     so we don't need to send this huge object over the wire
+        // }

         return builder.build();
     }
@@ -292,7 +309,7 @@ public class QueryProtobufCodec {
                 IndexProtobufCodec.convertSpecLimit(specs.getRank()),
                 specs.getQueryLimits(),
                 QueryStrategy.valueOf(specs.getQueryStrategy()),
-                Objects.requireNonNullElseGet(specs.getParameters(), PrototypeRankingParameters::sensibleDefaults)
+                specs.hasParameters() ? specs.getParameters() : null
        );
    }
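
For context on the last hunk: protobuf-java getters for message fields never return null — getParameters() yields the default instance when the field is unset — so the old Objects.requireNonNullElseGet could never actually fall back, and hasParameters() is the correct presence check. The default-filling now happens on the index side instead (see SearchParameters below).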

View File

@@ -5,6 +5,7 @@ import nu.marginalia.api.searchquery.RpcResultRankingParameters;
 import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.query.limit.SpecificationLimit;

+import javax.annotation.Nullable;
 import java.util.List;

 public class SearchSpecification {
@@ -28,6 +29,7 @@ public class SearchSpecification {
     public final QueryStrategy queryStrategy;

+    @Nullable
     public final RpcResultRankingParameters rankingParams;

     public SearchSpecification(SearchQuery query,
@@ -40,7 +42,7 @@ public class SearchSpecification {
                                SpecificationLimit rank,
                                RpcQueryLimits queryLimits,
                                QueryStrategy queryStrategy,
-                               RpcResultRankingParameters rankingParams)
+                               @Nullable RpcResultRankingParameters rankingParams)
     {
         this.query = query;
         this.domains = domains;

View File

@@ -16,20 +16,19 @@ import org.slf4j.LoggerFactory;

 import java.util.ArrayList;
 import java.util.Comparator;
-import java.util.Iterator;
 import java.util.List;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.function.Consumer;

-import static java.lang.Math.clamp;

 @Singleton
 public class IndexClient {
     private static final Logger logger = LoggerFactory.getLogger(IndexClient.class);
     private final GrpcMultiNodeChannelPool<IndexApiGrpc.IndexApiBlockingStub> channelPool;
     private final DomainBlacklistImpl blacklist;
-    private static final ExecutorService executor = Executors.newVirtualThreadPerTaskExecutor();
+    private static final ExecutorService executor = Executors.newCachedThreadPool();

     @Inject
     public IndexClient(GrpcChannelPoolFactory channelPoolFactory, DomainBlacklistImpl blacklist) {
@@ -51,40 +50,37 @@ public class IndexClient {

     /** Execute a query on the index partitions and return the combined results. */
     public AggregateQueryResponse executeQueries(RpcIndexQuery indexRequest, Pagination pagination) {
-        List<CompletableFuture<Iterator<RpcDecoratedResultItem>>> futures =
-            channelPool.call(IndexApiGrpc.IndexApiBlockingStub::query)
-                    .async(executor)
-                    .runEach(indexRequest);
-
         final int requestedMaxResults = indexRequest.getQueryLimits().getResultsTotal();
-        final int resultsUpperBound = requestedMaxResults * channelPool.getNumNodes();
-
-        List<RpcDecoratedResultItem> results = new ArrayList<>(resultsUpperBound);
-
-        for (var future : futures) {
-            try {
-                future.get().forEachRemaining(results::add);
-            }
-            catch (Exception e) {
-                logger.error("Downstream exception", e);
-            }
-        }
+        AtomicInteger totalNumResults = new AtomicInteger(0);
+
+        List<RpcDecoratedResultItem> results =
+                channelPool.call(IndexApiGrpc.IndexApiBlockingStub::query)
+                        .async(executor)
+                        .runEach(indexRequest)
+                        .stream()
+                        .map(future -> future.thenApply(iterator -> {
+                            List<RpcDecoratedResultItem> ret = new ArrayList<>(requestedMaxResults);
+                            iterator.forEachRemaining(ret::add);
+                            totalNumResults.addAndGet(ret.size());
+                            return ret;
+                        }))
+                        .mapMulti((CompletableFuture<List<RpcDecoratedResultItem>> fut, Consumer<List<RpcDecoratedResultItem>> c) -> {
+                            try {
+                                c.accept(fut.join());
+                            } catch (Exception e) {
+                                logger.error("Error while fetching results", e);
+                            }
+                        })
+                        .flatMap(List::stream)
+                        .filter(item -> !isBlacklisted(item))
+                        .sorted(comparator)
+                        .skip(Math.max(0, (pagination.page - 1) * pagination.pageSize))
+                        .limit(pagination.pageSize)
+                        .toList();

-        // Sort the results by ranking score and remove blacklisted domains
-        results.sort(comparator);
-        results.removeIf(this::isBlacklisted);
-
-        int numReceivedResults = results.size();
-
-        // pagination is typically 1-indexed, so we need to adjust the start and end indices
-        int indexStart = (pagination.page - 1) * pagination.pageSize;
-        int indexEnd = (pagination.page) * pagination.pageSize;
-
-        results = results.subList(
-                clamp(indexStart, 0, Math.max(0, results.size() - 1)), // from is inclusive, so subtract 1 from size()
-                clamp(indexEnd, 0, results.size()));
-
-        return new AggregateQueryResponse(results, pagination.page(), numReceivedResults);
+        return new AggregateQueryResponse(results, pagination.page(), totalNumResults.get());
     }

     private boolean isBlacklisted(RpcDecoratedResultItem item) {
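
The rewritten pipeline leans on Stream.mapMulti (Java 16+) to join each partition's future and drop failures with a log line instead of aborting, which is what keeps one bad index partition from blowing up the entire query. A self-contained sketch of that pattern (names illustrative):

import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.stream.Stream;

public class MapMultiDemo {
    public static void main(String[] args) {
        List<Integer> results = Stream.of(
                        CompletableFuture.completedFuture(1),
                        CompletableFuture.<Integer>failedFuture(new RuntimeException("bad partition")),
                        CompletableFuture.completedFuture(3))
                .<Integer>mapMulti((future, consumer) -> {
                    try {
                        consumer.accept(future.join()); // propagate successes downstream
                    } catch (Exception e) {
                        // log and continue; the failed future is simply dropped
                        System.err.println("dropping failed partition: " + e.getMessage());
                    }
                })
                .toList();

        System.out.println(results); // [1, 3]
    }
}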

View File

@@ -13,8 +13,6 @@ import nu.marginalia.index.query.IndexSearchBudget;
 import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.searchset.SearchSet;

-import java.util.Objects;
-
 import static nu.marginalia.api.searchquery.IndexProtobufCodec.convertSpecLimit;

 public class SearchParameters {
@@ -88,7 +86,7 @@ public class SearchParameters {
         compiledQuery = CompiledQueryParser.parse(this.query.compiledQuery);
         compiledQueryIds = compiledQuery.mapToLong(SearchTermsUtil::getWordId);

-        rankingParams = Objects.requireNonNullElseGet(request.getParameters(), PrototypeRankingParameters::sensibleDefaults);
+        rankingParams = request.hasParameters() ? request.getParameters() : PrototypeRankingParameters.sensibleDefaults();
     }

View File

@@ -84,18 +84,33 @@ public record SearchParameters(WebsiteUrl url,
     }

     public String renderUrl() {
-        String path = String.format("/search?query=%s&profile=%s&js=%s&adtech=%s&recent=%s&searchTitle=%s&newfilter=%s&page=%d",
-                URLEncoder.encode(query, StandardCharsets.UTF_8),
-                URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8),
-                URLEncoder.encode(js.value, StandardCharsets.UTF_8),
-                URLEncoder.encode(adtech.value, StandardCharsets.UTF_8),
-                URLEncoder.encode(recent.value, StandardCharsets.UTF_8),
-                URLEncoder.encode(searchTitle.value, StandardCharsets.UTF_8),
-                Boolean.valueOf(newFilter).toString(),
-                page
-        );
-
-        return path;
+        StringBuilder pathBuilder = new StringBuilder("/search?");
+        pathBuilder.append("query=").append(URLEncoder.encode(query, StandardCharsets.UTF_8));
+
+        if (profile != SearchProfile.NO_FILTER) {
+            pathBuilder.append("&profile=").append(URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8));
+        }
+        if (js != SearchJsParameter.DEFAULT) {
+            pathBuilder.append("&js=").append(URLEncoder.encode(js.value, StandardCharsets.UTF_8));
+        }
+        if (adtech != SearchAdtechParameter.DEFAULT) {
+            pathBuilder.append("&adtech=").append(URLEncoder.encode(adtech.value, StandardCharsets.UTF_8));
+        }
+        if (recent != SearchRecentParameter.DEFAULT) {
+            pathBuilder.append("&recent=").append(URLEncoder.encode(recent.value, StandardCharsets.UTF_8));
+        }
+        if (searchTitle != SearchTitleParameter.DEFAULT) {
+            pathBuilder.append("&searchTitle=").append(URLEncoder.encode(searchTitle.value, StandardCharsets.UTF_8));
+        }
+        if (page != 1) {
+            pathBuilder.append("&page=").append(page);
+        }
+        if (newFilter) {
+            pathBuilder.append("&newfilter=").append(Boolean.valueOf(newFilter).toString());
+        }
+
+        return pathBuilder.toString();
     }

     public RpcTemporalBias.Bias temporalBias() {
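
The practical effect: a default search no longer drags every parameter along. Where the old format string always produced something like /search?query=foo&profile=...&js=...&adtech=...&recent=...&searchTitle=...&newfilter=false&page=1, the rewritten builder emits just /search?query=foo and appends a parameter only when it differs from its default.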

View File

@@ -3,27 +3,22 @@ package nu.marginalia.search.command.commands;

 import com.google.inject.Inject;
 import io.jooby.MapModelAndView;
 import io.jooby.ModelAndView;
-import nu.marginalia.search.JteRenderer;
 import nu.marginalia.search.SearchOperator;
 import nu.marginalia.search.command.SearchCommandInterface;
 import nu.marginalia.search.command.SearchParameters;
 import nu.marginalia.search.model.DecoratedSearchResults;
 import nu.marginalia.search.model.NavbarModel;

-import java.io.IOException;
 import java.util.Map;
 import java.util.Optional;

 public class SearchCommand implements SearchCommandInterface {
     private final SearchOperator searchOperator;
-    private final JteRenderer jteRenderer;

     @Inject
-    public SearchCommand(SearchOperator searchOperator,
-                         JteRenderer jteRenderer) throws IOException {
+    public SearchCommand(SearchOperator searchOperator){
         this.searchOperator = searchOperator;
-        this.jteRenderer = jteRenderer;
     }

     @Override

View File

@@ -37,7 +37,7 @@ public class SearchQueryService {
             @QueryParam String profile,
             @QueryParam String js,
             @QueryParam String recent,
-            @QueryParam String title,
+            @QueryParam String searchTitle,
             @QueryParam String adtech,
             @QueryParam Integer page
     ) {
@@ -47,7 +47,7 @@ public class SearchQueryService {
                 SearchProfile.getSearchProfile(profile),
                 SearchJsParameter.parse(js),
                 SearchRecentParameter.parse(recent),
-                SearchTitleParameter.parse(title),
+                SearchTitleParameter.parse(searchTitle),
                 SearchAdtechParameter.parse(adtech),
                 false,
                 Objects.requireNonNullElse(page,1));

View File

@@ -28,6 +28,7 @@ import org.slf4j.LoggerFactory;
 import java.sql.SQLException;
 import java.util.*;
 import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 import java.util.function.Supplier;
@@ -67,8 +68,12 @@ public class SearchSiteInfoService {
         this.screenshotService = screenshotService;
         this.dataSource = dataSource;
         this.searchSiteSubscriptions = searchSiteSubscriptions;
+
+        Thread.ofPlatform().name("Recently Added Domains Model Updater").start(this::modelUpdater);
     }

+    private volatile SiteOverviewModel cachedOverviewModel = new SiteOverviewModel(List.of());
+
     @GET
     @Path("/site")
     public ModelAndView<?> handleOverview(@QueryParam String domain) {
@@ -77,23 +82,48 @@ public class SearchSiteInfoService {
             return new MapModelAndView("redirect.jte", Map.of("url", "/site/"+domain));
         }

-        List<SiteOverviewModel.DiscoveredDomain> domains = new ArrayList<>();
-
-        try (var conn = dataSource.getConnection();
-             var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, DISCOVER_DATE FROM EC_DOMAIN WHERE NODE_AFFINITY = 0 ORDER BY ID DESC LIMIT 10")) {
-            var rs = stmt.executeQuery();
-            while (rs.next()) {
-                domains.add(new SiteOverviewModel.DiscoveredDomain(rs.getString("DOMAIN_NAME"), rs.getString("DISCOVER_DATE")));
-            }
-        }
-        catch (SQLException ex) {
-            throw new RuntimeException();
-        }
-
         return new MapModelAndView("siteinfo/start.jte",
                 Map.of("navbar", NavbarModel.SITEINFO,
-                        "model", new SiteOverviewModel(domains)));
+                        "model", cachedOverviewModel));
+    }
+
+    private void modelUpdater() {
+        while (!Thread.interrupted()) {
+            List<SiteOverviewModel.DiscoveredDomain> domains = new ArrayList<>();
+
+            // This query can be quite expensive, so we can't run it on demand
+            // for every request.  Instead, we run it every 15 minutes and cache
+            // the result.
+            try (var conn = dataSource.getConnection();
+                 var stmt = conn.prepareStatement("""
+                         SELECT DOMAIN_NAME, DISCOVER_DATE
+                         FROM EC_DOMAIN
+                         WHERE NODE_AFFINITY = 0
+                         ORDER BY ID DESC
+                         LIMIT 10
+                         """))
+            {
+                var rs = stmt.executeQuery();
+                while (rs.next()) {
+                    domains.add(new SiteOverviewModel.DiscoveredDomain(
+                            rs.getString("DOMAIN_NAME"),
+                            rs.getString("DISCOVER_DATE"))
+                    );
+                }
+            } catch (SQLException ex) {
+                logger.warn("Failed to get recently added domains: {}", ex.getMessage());
+            }
+
+            cachedOverviewModel = new SiteOverviewModel(domains);
+
+            try {
+                TimeUnit.MINUTES.sleep(15);
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+                break;
+            }
+        }
     }

     public record SiteOverviewModel(List<DiscoveredDomain> domains) {
@@ -107,7 +137,7 @@ public class SearchSiteInfoService {
             @PathParam String domainName,
             @QueryParam String view,
             @QueryParam Integer page
-    ) throws SQLException {
+    ) throws SQLException, ExecutionException {

         if (null == domainName || domainName.isBlank()) {
             return null;
@@ -193,7 +223,7 @@ public class SearchSiteInfoService {
         );
     }

-    private SiteInfoWithContext listInfo(Context context, String domainName) {
+    private SiteInfoWithContext listInfo(Context context, String domainName) throws ExecutionException {
         var domain = new EdgeDomain(domainName);
         final int domainId = domainQueries.tryGetDomainId(domain).orElse(-1);

View File

@@ -36,10 +36,11 @@
         </div>

-        <input type="hidden" name="js" value="${filters.removeJsOption.value()}">
-        <input type="hidden" name="adtech" value="${filters.reduceAdtechOption.value()}">
-        <input type="hidden" name="searchTitle" value="${filters.searchTitleOption.value()}">
-        <input type="hidden" name="recent" value="${filters.showRecentOption.value()}">
+        @if (filters.showRecentOption.isSet()) <input type="hidden" name="js" value="${filters.removeJsOption.value()}"> @endif
+        @if (filters.reduceAdtechOption.isSet()) <input type="hidden" name="adtech" value="${filters.reduceAdtechOption.value()}"> @endif
+        @if (filters.searchTitleOption.isSet()) <input type="hidden" name="searchTitle" value="${filters.searchTitleOption.value()}"> @endif
+        @if (filters.showRecentOption.isSet()) <input type="hidden" name="recent" value="${filters.showRecentOption.value()}"> @endif
         <input type="hidden" name="profile" value="${profile}">

     </form>

View File

@@ -34,12 +34,12 @@
 <div class="max-w-7xl mx-auto flex flex-col space-y-4 fill-w">
     <div class="border dark:border-gray-600 dark:bg-gray-800 bg-white rounded p-2 m-4 ">
         <div class="text-slate-700 dark:text-white text-sm p-4">
-            <div class="fas fa-wrench mr-1 text-margeblue dark:text-slate-200"></div>
-            This is the new design and home of Marginalia Search.  Migration to the new domain <pre class="inline text-red-800 dark:text-red-100">marginalia-search.com</pre> is currently <em>in progress</em>,
-            so mind that some things may be a bit broken for a day or two.  <a href="https://about.marginalia-search.com/article/redesign/" class="underline text-liteblue dark:text-blue-200">Read more</a>.
+            <div class="fas fa-gift mr-1 text-margeblue dark:text-slate-200"></div>
+            This is the new design and home of Marginalia Search.
+            You can read about what this entails <a href="https://about.marginalia-search.com/article/redesign/" class="underline text-liteblue dark:text-blue-200">here</a>.
             <p class="my-4"></p>
-            If you have any issues or feedback regarding this change, please email
-            <a href="mailto:contact@marginalia-search.com" class="underline text-liteblue dark:text-blue-200">contact@marginalia-search.com</a>.
+            The old version of Marginalia Search remains available at
+            <a href="https://old-search.marginalia.nu/" class="underline text-liteblue dark:text-blue-200">https://old-search.marginalia.nu/</a>.
         </div>
     </div>
     <div class="mx-auto flex flex-col sm:flex-row my-4 sm:space-x-2 space-y-2 sm:space-y-0 w-full md:w-auto px-2">

View File

@@ -1,5 +1,4 @@
 @import nu.marginalia.db.DbDomainQueries
-@import nu.marginalia.model.EdgeDomain
 @import nu.marginalia.search.svc.SearchSiteInfoService
 @import nu.marginalia.search.svc.SearchSiteInfoService.*
 @import nu.marginalia.search.model.UrlDetails
@@ -81,35 +80,6 @@
     @endif

-    @if (!siteInfo.siblingDomains().isEmpty())
-        <div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
-            <i class="fas fa-globe"></i>
-            <span>Related Subdomains</span>
-        </div>
-
-        <table class="min-w-full divide-y divide-gray-200 dark:divide-gray-600 mx-4">
-            <thead>
-            <tr class="bg-gray-50 dark:bg-gray-700">
-                <th scope="col" class="px-2 py-2 text-left text-xs font-medium text-gray-500 dark:text-gray-100 uppercase tracking-wider">Domain Name</th>
-            </tr>
-            </thead>
-            <tbody class="bg-white dark:bg-gray-800 divide-y divide-gray-200 dark:divide-gray-600 text-xs">
-            @for (DbDomainQueries.DomainWithNode sibling : siteInfo.siblingDomains())
-                <tr>
-                    <td class="px-3 py-6 md:py-3 whitespace-nowrap">
-                        <a class="text-liteblue dark:text-blue-200" href="/site/${sibling.domain().toString()}">${sibling.domain().toString()}</a>
-                        @if (!sibling.isIndexed())
-                            <i class="ml-1 fa-regular fa-question-circle text-gray-400 dark:text-gray-600 text-xs" title="Not indexed"></i>
-                        @endif
-                    </td>
-                </tr>
-            @endfor
-            </tbody>
-        </table>
-    @endif
-
     @if (siteInfo.domainInformation().isUnknownDomain())
         <div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
             <i class="fa-regular fa-circle-question"></i>
@@ -178,6 +148,36 @@
         </form>
     @endif

+    @if (!siteInfo.siblingDomains().isEmpty())
+        <div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
+            <i class="fas fa-globe"></i>
+            <span>Related Subdomains</span>
+        </div>
+
+        <table class="min-w-full divide-y divide-gray-200 dark:divide-gray-600 mx-4">
+            <thead>
+            <tr class="bg-gray-50 dark:bg-gray-700">
+                <th scope="col" class="px-2 py-2 text-left text-xs font-medium text-gray-500 dark:text-gray-100 uppercase tracking-wider">Domain Name</th>
+            </tr>
+            </thead>
+            <tbody class="bg-white dark:bg-gray-800 divide-y divide-gray-200 dark:divide-gray-600 text-xs">
+            @for (DbDomainQueries.DomainWithNode sibling : siteInfo.siblingDomains())
+                <tr>
+                    <td class="px-3 py-6 md:py-3 whitespace-nowrap">
+                        <a class="text-liteblue dark:text-blue-200" href="/site/${sibling.domain().toString()}">${sibling.domain().toString()}</a>
+                        @if (!sibling.isIndexed())
+                            <i class="ml-1 fa-regular fa-question-circle text-gray-400 dark:text-gray-600 text-xs" title="Not indexed"></i>
+                        @endif
+                    </td>
+                </tr>
+            @endfor
+            </tbody>
+        </table>
+    @endif
+
     @if (siteInfo.isKnown())
         <div class="mx-3 flex place-items-baseline space-x-2 p-2 bg-gray-100 dark:bg-gray-600 rounded">
             <i class="fas fa-chart-simple"></i>

View File

@@ -28,7 +28,7 @@ import java.util.concurrent.TimeUnit;
 public class ScreenshotCaptureToolMain {

     private static final Logger logger = LoggerFactory.getLogger(ScreenshotCaptureToolMain.class);
+    private static final String BROWSERLESS_TOKEN = System.getenv("live-capture.browserless-token");

     public static void main(String[] args) {
         DatabaseModule databaseModule = new DatabaseModule(false);
         var ds = databaseModule.provideConnection();
@@ -107,7 +107,7 @@ public class ScreenshotCaptureToolMain {
         );

         var request = HttpRequest.newBuilder()
-                .uri(new URI("http://browserless:3000/screenshot"))
+                .uri(new URI("http://browserless:3000/screenshot?token=" + BROWSERLESS_TOKEN))
                 .method("POST", HttpRequest.BodyPublishers.ofString(
                         gson.toJson(requestData)
                 ))

View File

@@ -1,3 +1,4 @@
 ## This is a token file for automatic deployment

+2025-01-08: Deploy executor.
 2025-01-07: Deploy executor.