1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 17:32:39 +02:00

Compare commits

...

17 Commits

Author SHA1 Message Date
Viktor Lofgren
be7d13ccce (crawler) Correct task execution logic in crawler
The old behavior would flag domains as pending too soon, leading to them being omitted from execution if they were not immediately available to run.
2025-03-09 13:47:51 +01:00
Viktor Lofgren
8c088a7c0b (crawler) Remove custom thread factory
This was causing issues, and not really doing much of benefit.
2025-03-09 11:50:52 +01:00
Viktor Lofgren
ea9a642b9b (crawler) More effective task scheduling in the crawler
This should hopefully allow more threads to be busy
2025-03-09 11:44:59 +01:00
Viktor Lofgren
27f528af6a (search) Fix "Remove Javascript" toggle
A bug was introduced at some point where the special keyword for filtering on javascript was changed to special:scripts, from js:true/js:false.

Solves issue #155
2025-02-28 12:03:04 +01:00
Viktor Lofgren
20ca41ec95 (processed model) Use String columns instead of Txt columns for SlopDocumentRecord
It's very likely TxtStringColumn is the culprit of the bug seen in https://github.com/MarginaliaSearch/MarginaliaSearch/issues/154 where the wrong URL was shown for a search result.
2025-02-24 11:41:51 +01:00
Viktor Lofgren
7671f0d9e4 (search) Display message when no search results are found 2025-02-24 11:15:55 +01:00
Viktor Lofgren
44d6bc71b7 (assistant) Migrate to Jooby framework 2025-02-15 13:28:12 +01:00
Viktor Lofgren
9d302e2973 (assistant) Migrate to Jooby framework 2025-02-15 13:26:04 +01:00
Viktor Lofgren
f553701224 (assistant) Migrate to Jooby framework 2025-02-15 13:21:48 +01:00
Viktor Lofgren
f076d05595 (deps) Upgrade slf4j to latest 2025-02-15 12:50:16 +01:00
Viktor Lofgren
b513809710 (*) Stopgap fix for metrics server initialization errors bringing down services 2025-02-14 17:09:48 +01:00
Viktor Lofgren
7519b28e21 (search) Correct exception from misbehaving bots feeding invalid urls 2025-02-14 17:05:24 +01:00
Viktor Lofgren
3eac4dd57f (search) Correct exception in error handler when page is missing 2025-02-14 17:00:21 +01:00
Viktor Lofgren
4c2810720a (search) Add redirect handler for full URLs in the /site endpoint 2025-02-14 16:31:11 +01:00
Viktor Lofgren
8480ba8daa (live-capture) Code cleanup 2025-02-04 14:05:36 +01:00
Viktor Lofgren
fbba392491 (live-capture) Send a UA-string from the browserless fetcher as well
The change also introduces a somewhat convoluted wiremock test to intercept and verify that these headers are in fact sent
2025-02-04 13:36:49 +01:00
Viktor Lofgren
530eb35949 (update-rss) Do not fail the feed fetcher control actor if it takes a long time to complete. 2025-02-03 11:35:32 +01:00
23 changed files with 369 additions and 93 deletions

View File

@@ -6,6 +6,7 @@ import nu.marginalia.service.ServiceId;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.InetAddress; import java.net.InetAddress;
import java.net.NetworkInterface; import java.net.NetworkInterface;
import java.util.Enumeration; import java.util.Enumeration;
@@ -115,7 +116,7 @@ public class ServiceConfigurationModule extends AbstractModule {
} }
} }
public static String getLocalNetworkIP() throws Exception { public static String getLocalNetworkIP() throws IOException {
Enumeration<NetworkInterface> nets = NetworkInterface.getNetworkInterfaces(); Enumeration<NetworkInterface> nets = NetworkInterface.getNetworkInterfaces();
while (nets.hasMoreElements()) { while (nets.hasMoreElements()) {

View File

@@ -15,6 +15,7 @@ import org.slf4j.LoggerFactory;
import org.slf4j.Marker; import org.slf4j.Marker;
import org.slf4j.MarkerFactory; import org.slf4j.MarkerFactory;
import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.util.List; import java.util.List;
@@ -106,9 +107,12 @@ public class JoobyService {
config.externalAddress()); config.externalAddress());
// FIXME: This won't work outside of docker, may need to submit a PR to jooby to allow classpaths here // FIXME: This won't work outside of docker, may need to submit a PR to jooby to allow classpaths here
if (Files.exists(Path.of("/app/resources/jte")) || Files.exists(Path.of("/app/classes/jte-precompiled"))) {
jooby.install(new JteModule(Path.of("/app/resources/jte"), Path.of("/app/classes/jte-precompiled"))); jooby.install(new JteModule(Path.of("/app/resources/jte"), Path.of("/app/classes/jte-precompiled")));
}
if (Files.exists(Path.of("/app/resources/static"))) {
jooby.assets("/*", Paths.get("/app/resources/static")); jooby.assets("/*", Paths.get("/app/resources/static"));
}
var options = new ServerOptions(); var options = new ServerOptions();
options.setHost(config.bindAddress()); options.setHost(config.bindAddress());
options.setPort(restEndpoint.port()); options.setPort(restEndpoint.port());

View File

@@ -6,17 +6,22 @@ import nu.marginalia.service.module.ServiceConfiguration;
import org.eclipse.jetty.server.Server; import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.servlet.ServletContextHandler; import org.eclipse.jetty.servlet.ServletContextHandler;
import org.eclipse.jetty.servlet.ServletHolder; import org.eclipse.jetty.servlet.ServletHolder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.InetSocketAddress; import java.net.InetSocketAddress;
public class MetricsServer { public class MetricsServer {
/** Class-wide logger; used to report a failure to set up the metrics server. */
private static final Logger logger = LoggerFactory.getLogger(MetricsServer.class);
@Inject @Inject
public MetricsServer(ServiceConfiguration configuration) throws Exception { public MetricsServer(ServiceConfiguration configuration) {
// If less than zero, we forego setting up a metrics server // If less than zero, we forego setting up a metrics server
if (configuration.metricsPort() < 0) if (configuration.metricsPort() < 0)
return; return;
try {
Server server = new Server(new InetSocketAddress(configuration.bindAddress(), configuration.metricsPort())); Server server = new Server(new InetSocketAddress(configuration.bindAddress(), configuration.metricsPort()));
ServletContextHandler context = new ServletContextHandler(); ServletContextHandler context = new ServletContextHandler();
@@ -27,4 +32,8 @@ public class MetricsServer {
server.start(); server.start();
} }
catch (Exception|NoSuchMethodError ex) {
logger.error("Failed to set up metrics server", ex);
}
}
} }

View File

@@ -14,6 +14,8 @@ import nu.marginalia.mq.persistence.MqPersistence;
import nu.marginalia.nodecfg.NodeConfigurationService; import nu.marginalia.nodecfg.NodeConfigurationService;
import nu.marginalia.nodecfg.model.NodeProfile; import nu.marginalia.nodecfg.model.NodeProfile;
import nu.marginalia.service.module.ServiceConfiguration; import nu.marginalia.service.module.ServiceConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.time.Duration; import java.time.Duration;
import java.time.LocalDateTime; import java.time.LocalDateTime;
@@ -29,6 +31,7 @@ public class UpdateRssActor extends RecordActorPrototype {
private final NodeConfigurationService nodeConfigurationService; private final NodeConfigurationService nodeConfigurationService;
private final MqPersistence persistence; private final MqPersistence persistence;
private static final Logger logger = LoggerFactory.getLogger(UpdateRssActor.class);
@Inject @Inject
public UpdateRssActor(Gson gson, public UpdateRssActor(Gson gson,
@@ -101,8 +104,8 @@ public class UpdateRssActor extends RecordActorPrototype {
case UpdateRefresh(int count, long msgId) -> { case UpdateRefresh(int count, long msgId) -> {
MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12)); MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
if (msg == null) { if (msg == null) {
// Retry the update logger.warn("UpdateRefresh is taking a very long time");
yield new Error("Failed to update feeds: message not found"); yield new UpdateRefresh(count, msgId);
} else if (msg.state() != MqMessageState.OK) { } else if (msg.state() != MqMessageState.OK) {
// Retry the update // Retry the update
yield new Error("Failed to update feeds: " + msg.state()); yield new Error("Failed to update feeds: " + msg.state());
@@ -119,8 +122,8 @@ public class UpdateRssActor extends RecordActorPrototype {
case UpdateClean(long msgId) -> { case UpdateClean(long msgId) -> {
MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12)); MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
if (msg == null) { if (msg == null) {
// Retry the update logger.warn("UpdateClean is taking a very long time");
yield new Error("Failed to update feeds: message not found"); yield new UpdateClean(msgId);
} else if (msg.state() != MqMessageState.OK) { } else if (msg.state() != MqMessageState.OK) {
// Retry the update // Retry the update
yield new Error("Failed to update feeds: " + msg.state()); yield new Error("Failed to update feeds: " + msg.state());

View File

@@ -34,6 +34,7 @@ dependencies {
implementation libs.bundles.slf4j implementation libs.bundles.slf4j
implementation libs.commons.lang3 implementation libs.commons.lang3
implementation libs.commons.io implementation libs.commons.io
implementation libs.wiremock
implementation libs.prometheus implementation libs.prometheus
implementation libs.guava implementation libs.guava

View File

@@ -1,6 +1,7 @@
package nu.marginalia.livecapture; package nu.marginalia.livecapture;
import com.google.gson.Gson; import com.google.gson.Gson;
import nu.marginalia.WmsaHome;
import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.model.gson.GsonFactory;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@@ -12,6 +13,7 @@ import java.net.http.HttpRequest;
import java.net.http.HttpResponse; import java.net.http.HttpResponse;
import java.time.Duration; import java.time.Duration;
import java.util.Map; import java.util.Map;
import java.util.Optional;
/** Client for local browserless.io API */ /** Client for local browserless.io API */
public class BrowserlessClient implements AutoCloseable { public class BrowserlessClient implements AutoCloseable {
@@ -27,13 +29,16 @@ public class BrowserlessClient implements AutoCloseable {
private final URI browserlessURI; private final URI browserlessURI;
private final Gson gson = GsonFactory.get(); private final Gson gson = GsonFactory.get();
private final String userAgent = WmsaHome.getUserAgent().uaString();
public BrowserlessClient(URI browserlessURI) { public BrowserlessClient(URI browserlessURI) {
this.browserlessURI = browserlessURI; this.browserlessURI = browserlessURI;
} }
public String content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException { public Optional<String> content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
Map<String, Object> requestData = Map.of( Map<String, Object> requestData = Map.of(
"url", url, "url", url,
"userAgent", userAgent,
"gotoOptions", gotoOptions "gotoOptions", gotoOptions
); );
@@ -49,10 +54,10 @@ public class BrowserlessClient implements AutoCloseable {
if (rsp.statusCode() >= 300) { if (rsp.statusCode() >= 300) {
logger.info("Failed to fetch content for {}, status {}", url, rsp.statusCode()); logger.info("Failed to fetch content for {}, status {}", url, rsp.statusCode());
return null; return Optional.empty();
} }
return rsp.body(); return Optional.of(rsp.body());
} }
public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions) public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions)
@@ -60,6 +65,7 @@ public class BrowserlessClient implements AutoCloseable {
Map<String, Object> requestData = Map.of( Map<String, Object> requestData = Map.of(
"url", url, "url", url,
"userAgent", userAgent,
"options", screenshotOptions, "options", screenshotOptions,
"gotoOptions", gotoOptions "gotoOptions", gotoOptions
); );
@@ -84,7 +90,7 @@ public class BrowserlessClient implements AutoCloseable {
} }
@Override @Override
public void close() throws Exception { public void close() {
httpClient.shutdownNow(); httpClient.shutdownNow();
} }

View File

@@ -1,5 +1,9 @@
package nu.marginalia.livecapture; package nu.marginalia.livecapture;
import com.github.tomakehurst.wiremock.WireMockServer;
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
import nu.marginalia.WmsaHome;
import nu.marginalia.service.module.ServiceConfigurationModule;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Tag;
@@ -8,34 +12,86 @@ import org.testcontainers.containers.GenericContainer;
import org.testcontainers.junit.jupiter.Testcontainers; import org.testcontainers.junit.jupiter.Testcontainers;
import org.testcontainers.utility.DockerImageName; import org.testcontainers.utility.DockerImageName;
import java.io.IOException;
import java.net.URI; import java.net.URI;
import java.util.Map; import java.util.Map;
import static com.github.tomakehurst.wiremock.client.WireMock.*;
@Testcontainers @Testcontainers
@Tag("slow") @Tag("slow")
public class BrowserlessClientTest { public class BrowserlessClientTest {
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome")) static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
.withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN")) .withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
.withNetworkMode("bridge")
.withExposedPorts(3000); .withExposedPorts(3000);
static WireMockServer wireMockServer =
new WireMockServer(WireMockConfiguration.wireMockConfig()
.port(18089));
static String localIp;
static URI browserlessURI;
@BeforeAll @BeforeAll
public static void setup() { public static void setup() throws IOException {
container.start(); container.start();
browserlessURI = URI.create(String.format("http://%s:%d/",
container.getHost(),
container.getMappedPort(3000))
);
wireMockServer.start();
wireMockServer.stubFor(get("/").willReturn(aResponse().withStatus(200).withBody("Ok")));
localIp = ServiceConfigurationModule.getLocalNetworkIP();
}
/** Asks browserless to fetch a page from the local WireMock server and then
 * verifies that the GET request WireMock received for "/" carried the
 * User-Agent string reported by WmsaHome.
 */
@Tag("flaky")
@Test
public void testInspectContentUA__Flaky() throws Exception {
    final String targetUrl = "http://" + localIp + ":18089/";

    try (var client = new BrowserlessClient(browserlessURI)) {
        client.content(targetUrl, BrowserlessClient.GotoOptions.defaultValues());
    }

    final String expectedUserAgent = WmsaHome.getUserAgent().uaString();
    wireMockServer.verify(
            getRequestedFor(urlEqualTo("/"))
                    .withHeader("User-Agent", equalTo(expectedUserAgent)));
}
@Tag("flaky")
@Test
public void testInspectScreenshotUA__Flaky() throws Exception {
try (var client = new BrowserlessClient(browserlessURI)) {
client.screenshot("http://" + localIp + ":18089/",
BrowserlessClient.GotoOptions.defaultValues(),
BrowserlessClient.ScreenshotOptions.defaultValues()
);
}
wireMockServer.verify(getRequestedFor(urlEqualTo("/")).withHeader("User-Agent", equalTo(WmsaHome.getUserAgent().uaString())));
} }
@Test @Test
public void testContent() throws Exception { public void testContent() throws Exception {
try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) { try (var client = new BrowserlessClient(browserlessURI)) {
var content = client.content("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()); var content = client.content("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()).orElseThrow();
Assertions.assertNotNull(content, "Content should not be null");
Assertions.assertFalse(content.isBlank(), "Content should not be empty"); Assertions.assertFalse(content.isBlank(), "Content should not be empty");
} }
} }
@Test @Test
public void testScreenshot() throws Exception { public void testScreenshot() throws Exception {
try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) { try (var client = new BrowserlessClient(browserlessURI)) {
var screenshot = client.screenshot("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues(), BrowserlessClient.ScreenshotOptions.defaultValues()); var screenshot = client.screenshot("https://www.marginalia.nu/",
BrowserlessClient.GotoOptions.defaultValues(),
BrowserlessClient.ScreenshotOptions.defaultValues());
Assertions.assertNotNull(screenshot, "Screenshot should not be null"); Assertions.assertNotNull(screenshot, "Screenshot should not be null");
} }
} }

View File

@@ -11,7 +11,6 @@ import nu.marginalia.slop.column.primitive.IntColumn;
import nu.marginalia.slop.column.primitive.LongColumn; import nu.marginalia.slop.column.primitive.LongColumn;
import nu.marginalia.slop.column.string.EnumColumn; import nu.marginalia.slop.column.string.EnumColumn;
import nu.marginalia.slop.column.string.StringColumn; import nu.marginalia.slop.column.string.StringColumn;
import nu.marginalia.slop.column.string.TxtStringColumn;
import nu.marginalia.slop.desc.StorageType; import nu.marginalia.slop.desc.StorageType;
import org.jetbrains.annotations.Nullable; import org.jetbrains.annotations.Nullable;
@@ -182,8 +181,8 @@ public record SlopDocumentRecord(
} }
// Basic information // Basic information
private static final TxtStringColumn domainsColumn = new TxtStringColumn("domain", StandardCharsets.UTF_8, StorageType.GZIP); private static final StringColumn domainsColumn = new StringColumn("domain", StandardCharsets.UTF_8, StorageType.GZIP);
private static final TxtStringColumn urlsColumn = new TxtStringColumn("url", StandardCharsets.UTF_8, StorageType.GZIP); private static final StringColumn urlsColumn = new StringColumn("url", StandardCharsets.UTF_8, StorageType.GZIP);
private static final VarintColumn ordinalsColumn = new VarintColumn("ordinal", StorageType.PLAIN); private static final VarintColumn ordinalsColumn = new VarintColumn("ordinal", StorageType.PLAIN);
private static final EnumColumn statesColumn = new EnumColumn("state", StandardCharsets.US_ASCII, StorageType.PLAIN); private static final EnumColumn statesColumn = new EnumColumn("state", StandardCharsets.US_ASCII, StorageType.PLAIN);
private static final StringColumn stateReasonsColumn = new StringColumn("stateReason", StandardCharsets.US_ASCII, StorageType.GZIP); private static final StringColumn stateReasonsColumn = new StringColumn("stateReason", StandardCharsets.US_ASCII, StorageType.GZIP);
@@ -211,7 +210,7 @@ public record SlopDocumentRecord(
private static final VarintCodedSequenceArrayColumn spansColumn = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD); private static final VarintCodedSequenceArrayColumn spansColumn = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD);
public static class KeywordsProjectionReader extends SlopTable { public static class KeywordsProjectionReader extends SlopTable {
private final TxtStringColumn.Reader domainsReader; private final StringColumn.Reader domainsReader;
private final VarintColumn.Reader ordinalsReader; private final VarintColumn.Reader ordinalsReader;
private final IntColumn.Reader htmlFeaturesReader; private final IntColumn.Reader htmlFeaturesReader;
private final LongColumn.Reader domainMetadataReader; private final LongColumn.Reader domainMetadataReader;
@@ -275,8 +274,8 @@ public record SlopDocumentRecord(
} }
public static class MetadataReader extends SlopTable { public static class MetadataReader extends SlopTable {
private final TxtStringColumn.Reader domainsReader; private final StringColumn.Reader domainsReader;
private final TxtStringColumn.Reader urlsReader; private final StringColumn.Reader urlsReader;
private final VarintColumn.Reader ordinalsReader; private final VarintColumn.Reader ordinalsReader;
private final StringColumn.Reader titlesReader; private final StringColumn.Reader titlesReader;
private final StringColumn.Reader descriptionsReader; private final StringColumn.Reader descriptionsReader;
@@ -332,8 +331,8 @@ public record SlopDocumentRecord(
} }
public static class Writer extends SlopTable { public static class Writer extends SlopTable {
private final TxtStringColumn.Writer domainsWriter; private final StringColumn.Writer domainsWriter;
private final TxtStringColumn.Writer urlsWriter; private final StringColumn.Writer urlsWriter;
private final VarintColumn.Writer ordinalsWriter; private final VarintColumn.Writer ordinalsWriter;
private final EnumColumn.Writer statesWriter; private final EnumColumn.Writer statesWriter;
private final StringColumn.Writer stateReasonsWriter; private final StringColumn.Writer stateReasonsWriter;

View File

@@ -41,10 +41,7 @@ import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.StandardCopyOption; import java.nio.file.StandardCopyOption;
import java.security.Security; import java.security.Security;
import java.util.ArrayList; import java.util.*;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
@@ -248,22 +245,47 @@ public class CrawlerMain extends ProcessMainClass {
// (this happens when the process is restarted after a crash or a shutdown) // (this happens when the process is restarted after a crash or a shutdown)
tasksDone.set(workLog.countFinishedJobs()); tasksDone.set(workLog.countFinishedJobs());
// List of deferred tasks used to ensure beneficial scheduling of domains with regard to DomainLocks,
// merely shuffling the domains tends to lead to a lot of threads being blocked waiting for a semaphore,
// this will more aggressively attempt to schedule the jobs to avoid blocking
List<CrawlTask> deferredTasks = new LinkedList<>();
// Create crawl tasks and submit them to the pool for execution // Create crawl tasks and submit them to the pool for execution
for (CrawlSpecRecord crawlSpec : crawlSpecRecords) { for (CrawlSpecRecord crawlSpec : crawlSpecRecords) {
if (workLog.isJobFinished(crawlSpec.domain())) if (workLog.isJobFinished(crawlSpec.domain()))
continue; continue;
var task = new CrawlTask( // Add to the end of the deferral list
deferredTasks.addLast(new CrawlTask(
crawlSpec, crawlSpec,
anchorTagsSource, anchorTagsSource,
outputDir, outputDir,
warcArchiver, warcArchiver,
domainStateDb, domainStateDb,
workLog); workLog));
if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) == null) { // Start every task we currently can from the deferral list
pool.submitQuietly(task); deferredTasks.removeIf(task -> {
if (task.canRun()) {
if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) != null) {
return true; // task has already run, duplicate in crawl specs
} }
// This blocks the caller when the pool is full
pool.submitQuietly(task);
return true;
}
return false;
});
}
// Schedule any lingering tasks for immediate execution
for (var task : deferredTasks) {
if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null)
continue;
pool.submitQuietly(task);
} }
logger.info("Shutting down the pool, waiting for tasks to complete..."); logger.info("Shutting down the pool, waiting for tasks to complete...");
@@ -346,6 +368,12 @@ public class CrawlerMain extends ProcessMainClass {
this.id = Integer.toHexString(domain.hashCode()); this.id = Integer.toHexString(domain.hashCode());
} }
/**
 * Best-effort indicator of whether this task could be started right now
 * without getting stuck waiting in DomainLocks.
 *
 * @return whatever {@code domainLocks.canLock(...)} reports for this task's domain
 */
public boolean canRun() {
    final EdgeDomain edgeDomain = new EdgeDomain(domain);
    return domainLocks.canLock(edgeDomain);
}
@Override @Override
public void run() throws Exception { public void run() throws Exception {

View File

@@ -251,6 +251,7 @@ public class HttpFetcherImpl implements HttpFetcher {
return new SitemapRetriever(); return new SitemapRetriever();
} }
/** Recursively fetch sitemaps */
@Override @Override
public List<EdgeUrl> fetchSitemapUrls(String root, CrawlDelayTimer delayTimer) { public List<EdgeUrl> fetchSitemapUrls(String root, CrawlDelayTimer delayTimer) {
try { try {
@@ -270,7 +271,7 @@ public class HttpFetcherImpl implements HttpFetcher {
while (!sitemapQueue.isEmpty() && ret.size() < 20_000 && ++fetchedSitemaps < 10) { while (!sitemapQueue.isEmpty() && ret.size() < 20_000 && ++fetchedSitemaps < 10) {
var head = sitemapQueue.removeFirst(); var head = sitemapQueue.removeFirst();
switch (fetchSitemap(head)) { switch (fetchSingleSitemap(head)) {
case SitemapResult.SitemapUrls(List<String> urls) -> { case SitemapResult.SitemapUrls(List<String> urls) -> {
for (var url : urls) { for (var url : urls) {
@@ -306,7 +307,7 @@ public class HttpFetcherImpl implements HttpFetcher {
} }
private SitemapResult fetchSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException { private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
HttpRequest getRequest = HttpRequest.newBuilder() HttpRequest getRequest = HttpRequest.newBuilder()
.GET() .GET()
.uri(sitemapUrl.asURI()) .uri(sitemapUrl.asURI())

View File

@@ -44,6 +44,14 @@ public class DomainLocks {
return new Semaphore(2); return new Semaphore(2);
} }
/**
 * Reports whether a lock for the given domain appears to be obtainable.
 * <p>
 * The semaphore is looked up under the lower-cased top domain. The permit
 * count is only read here, nothing is acquired.
 *
 * @param domain the domain whose top domain is used as the lookup key
 * @return {@code true} if no semaphore is registered for the key, or if the
 *         registered semaphore has at least one available permit
 */
public boolean canLock(EdgeDomain domain) {
    final String key = domain.topDomain.toLowerCase();
    final Semaphore domainSemaphore = locks.get(key);

    return domainSemaphore == null || domainSemaphore.availablePermits() > 0;
}
public static class DomainLock implements AutoCloseable { public static class DomainLock implements AutoCloseable {
private final String domainName; private final String domainName;
private final Semaphore semaphore; private final Semaphore semaphore;

View File

@@ -7,8 +7,7 @@ import java.util.Arrays;
public enum SearchJsParameter { public enum SearchJsParameter {
DEFAULT("default"), DEFAULT("default"),
DENY_JS("no-js", "js:true"), DENY_JS("no-js", "special:scripts");
REQUIRE_JS("yes-js", "js:false");
public final String value; public final String value;
public final String[] implictExcludeSearchTerms; public final String[] implictExcludeSearchTerms;
@@ -20,7 +19,6 @@ public enum SearchJsParameter {
public static SearchJsParameter parse(@Nullable String value) { public static SearchJsParameter parse(@Nullable String value) {
if (DENY_JS.value.equals(value)) return DENY_JS; if (DENY_JS.value.equals(value)) return DENY_JS;
if (REQUIRE_JS.value.equals(value)) return REQUIRE_JS;
return DEFAULT; return DEFAULT;
} }

View File

@@ -3,8 +3,10 @@ package nu.marginalia.search;
import com.google.inject.Inject; import com.google.inject.Inject;
import io.jooby.Context; import io.jooby.Context;
import io.jooby.Jooby; import io.jooby.Jooby;
import io.jooby.StatusCode;
import io.prometheus.client.Counter; import io.prometheus.client.Counter;
import io.prometheus.client.Histogram; import io.prometheus.client.Histogram;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.search.svc.*; import nu.marginalia.search.svc.*;
import nu.marginalia.service.discovery.property.ServicePartition; import nu.marginalia.service.discovery.property.ServicePartition;
import nu.marginalia.service.server.BaseServiceParams; import nu.marginalia.service.server.BaseServiceParams;
@@ -16,6 +18,7 @@ import java.util.List;
public class SearchService extends JoobyService { public class SearchService extends JoobyService {
private final WebsiteUrl websiteUrl;
private final SearchSiteSubscriptionService siteSubscriptionService; private final SearchSiteSubscriptionService siteSubscriptionService;
private static final Logger logger = LoggerFactory.getLogger(SearchService.class); private static final Logger logger = LoggerFactory.getLogger(SearchService.class);
@@ -33,6 +36,7 @@ public class SearchService extends JoobyService {
@Inject @Inject
public SearchService(BaseServiceParams params, public SearchService(BaseServiceParams params,
WebsiteUrl websiteUrl,
SearchFrontPageService frontPageService, SearchFrontPageService frontPageService,
SearchAddToCrawlQueueService addToCrawlQueueService, SearchAddToCrawlQueueService addToCrawlQueueService,
SearchSiteSubscriptionService siteSubscriptionService, SearchSiteSubscriptionService siteSubscriptionService,
@@ -51,6 +55,7 @@ public class SearchService extends JoobyService {
new SearchAddToCrawlQueueService_(addToCrawlQueueService), new SearchAddToCrawlQueueService_(addToCrawlQueueService),
new SearchBrowseService_(searchBrowseService) new SearchBrowseService_(searchBrowseService)
)); ));
this.websiteUrl = websiteUrl;
this.siteSubscriptionService = siteSubscriptionService; this.siteSubscriptionService = siteSubscriptionService;
} }
@@ -62,6 +67,10 @@ public class SearchService extends JoobyService {
final String startTimeAttribute = "start-time"; final String startTimeAttribute = "start-time";
jooby.get("/export-opml", siteSubscriptionService::exportOpml); jooby.get("/export-opml", siteSubscriptionService::exportOpml);
jooby.get("/site/https://*", this::handleSiteUrlRedirect);
jooby.get("/site/http://*", this::handleSiteUrlRedirect);
jooby.before((Context ctx) -> { jooby.before((Context ctx) -> {
ctx.setAttribute(startTimeAttribute, System.nanoTime()); ctx.setAttribute(startTimeAttribute, System.nanoTime());
}); });
@@ -80,5 +89,19 @@ public class SearchService extends JoobyService {
}); });
} }
/** Redirect handler for the case where the user passes a full URL,
 * such as /site/https://example.com/.
 * <p>
 * The wildcard part of the path is cut at its first '/' (unless that
 * slash is the very first character, in which case it is used as-is),
 * and a temporary redirect is sent to "site/" followed by what remains,
 * e.g. /site/example.com
 */
private Context handleSiteUrlRedirect(Context ctx) {
    final var wildcard = ctx.path("*").value();
    final int slashIdx = wildcard.indexOf('/');

    // Keep only the part before the first slash, when there is one past index 0
    final var domainPart = slashIdx > 0
            ? wildcard.substring(0, slashIdx)
            : wildcard;

    ctx.sendRedirect(StatusCode.TEMPORARY_REDIRECT, websiteUrl.withPath("site/" + domainPart));
    return ctx;
}
} }

View File

@@ -7,9 +7,7 @@ import java.util.Arrays;
public enum SearchJsParameter { public enum SearchJsParameter {
DEFAULT("default"), DEFAULT("default"),
DENY_JS("no-js", "js:true"), DENY_JS("no-js", "special:scripts");
REQUIRE_JS("yes-js", "js:false");
public final String value; public final String value;
public final String[] implictExcludeSearchTerms; public final String[] implictExcludeSearchTerms;
@@ -20,7 +18,6 @@ public enum SearchJsParameter {
public static SearchJsParameter parse(@Nullable String value) { public static SearchJsParameter parse(@Nullable String value) {
if (DENY_JS.value.equals(value)) return DENY_JS; if (DENY_JS.value.equals(value)) return DENY_JS;
if (REQUIRE_JS.value.equals(value)) return REQUIRE_JS;
return DEFAULT; return DEFAULT;
} }

View File

@@ -86,8 +86,10 @@ public record SearchParameters(WebsiteUrl url,
public String renderUrl() { public String renderUrl() {
StringBuilder pathBuilder = new StringBuilder("/search?"); StringBuilder pathBuilder = new StringBuilder("/search?");
pathBuilder.append("query=").append(URLEncoder.encode(query, StandardCharsets.UTF_8));
if (query != null) {
pathBuilder.append("query=").append(URLEncoder.encode(query, StandardCharsets.UTF_8));
}
if (profile != SearchProfile.NO_FILTER) { if (profile != SearchProfile.NO_FILTER) {
pathBuilder.append("&profile=").append(URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8)); pathBuilder.append("&profile=").append(URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8));
} }

View File

@@ -67,6 +67,10 @@ public class DecoratedSearchResults {
return focusDomainId >= 0; return focusDomainId >= 0;
} }
/** Returns true when {@code results} is empty, as reported by its own {@code isEmpty()}. */
public boolean isEmpty() {
return results.isEmpty();
}
public SearchFilters getFilters() { public SearchFilters getFilters() {
return filters; return filters;
} }

View File

@@ -56,7 +56,9 @@ public class SearchQueryService {
} }
catch (Exception ex) { catch (Exception ex) {
logger.error("Error", ex); logger.error("Error", ex);
return errorPageService.serveError(SearchParameters.defaultsForQuery(websiteUrl, query, page)); return errorPageService.serveError(
SearchParameters.defaultsForQuery(websiteUrl, query, Objects.requireNonNullElse(page, 1))
);
} }
} }

View File

@@ -44,6 +44,11 @@
<div class="grow"></div> <div class="grow"></div>
<a href="${results.getParams().renderUrlWithoutSiteFocus()}" class="fa fa-remove"></a> <a href="${results.getParams().renderUrlWithoutSiteFocus()}" class="fa fa-remove"></a>
</div> </div>
@elseif (results.isEmpty())
<div class="border dark:border-gray-600 rounded flex space-x-4 bg-white dark:bg-gray-800 text-gray-600 dark:text-gray-100 text-sm p-4 items-center">
No search results found. Try different search terms, or spelling variations. The search engine currently
only supports queries in the English language.
</div>
@endif @endif
<div class="space-y-4 sm:space-y-6"> <div class="space-y-4 sm:space-y-6">

View File

@@ -23,7 +23,12 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
apply from: "$rootProject.projectDir/docker.gradle" apply from: "$rootProject.projectDir/docker.gradle"
dependencies { dependencies {
implementation project(':third-party:symspell')
implementation project(':code:common:db')
implementation project(':code:common:model')
implementation project(':code:common:service')
implementation project(':code:common:config')
implementation project(':code:functions:live-capture') implementation project(':code:functions:live-capture')
implementation project(':code:functions:live-capture:api') implementation project(':code:functions:live-capture:api')
@@ -32,20 +37,16 @@ dependencies {
implementation project(':code:functions:domain-info') implementation project(':code:functions:domain-info')
implementation project(':code:functions:domain-info:api') implementation project(':code:functions:domain-info:api')
implementation project(':code:common:config')
implementation project(':code:common:service')
implementation project(':code:common:model')
implementation project(':code:common:db')
implementation project(':code:features-search:screenshots')
implementation project(':code:libraries:geo-ip') implementation project(':code:libraries:geo-ip')
implementation project(':code:libraries:language-processing') implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict') implementation project(':code:libraries:term-frequency-dict')
implementation libs.bundles.slf4j implementation project(':third-party:symspell')
implementation libs.bundles.slf4j
implementation libs.prometheus implementation libs.prometheus
implementation libs.commons.io
implementation libs.guava implementation libs.guava
libs.bundles.grpc.get().each { libs.bundles.grpc.get().each {
implementation dependencies.create(it) { implementation dependencies.create(it) {
@@ -59,9 +60,7 @@ dependencies {
implementation dependencies.create(libs.guice.get()) { implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava' exclude group: 'com.google.guava'
} }
implementation dependencies.create(libs.spark.get()) { implementation libs.bundles.jooby
exclude group: 'org.eclipse.jetty'
}
implementation libs.bundles.jetty implementation libs.bundles.jetty
implementation libs.opencsv implementation libs.opencsv
implementation libs.trove implementation libs.trove

View File

@@ -3,6 +3,8 @@ package nu.marginalia.assistant;
import com.google.inject.Guice; import com.google.inject.Guice;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Injector; import com.google.inject.Injector;
import io.jooby.ExecutionMode;
import io.jooby.Jooby;
import nu.marginalia.livecapture.LivecaptureModule; import nu.marginalia.livecapture.LivecaptureModule;
import nu.marginalia.service.MainClass; import nu.marginalia.service.MainClass;
import nu.marginalia.service.ServiceId; import nu.marginalia.service.ServiceId;
@@ -38,8 +40,17 @@ public class AssistantMain extends MainClass {
var configuration = injector.getInstance(ServiceConfiguration.class); var configuration = injector.getInstance(ServiceConfiguration.class);
orchestrateBoot(registry, configuration); orchestrateBoot(registry, configuration);
injector.getInstance(AssistantMain.class); var main = injector.getInstance(AssistantMain.class);
injector.getInstance(Initialization.class).setReady(); injector.getInstance(Initialization.class).setReady();
Jooby.runApp(new String[] { "application.env=prod" }, ExecutionMode.WORKER, () -> new Jooby() {
{
main.start(this);
}
});
}
public void start(Jooby jooby) {
service.startJooby(jooby);
} }
} }

View File

@@ -2,27 +2,27 @@ package nu.marginalia.assistant;
import com.google.gson.Gson; import com.google.gson.Gson;
import com.google.inject.Inject; import com.google.inject.Inject;
import io.jooby.Context;
import io.jooby.Jooby;
import nu.marginalia.assistant.suggest.Suggestions; import nu.marginalia.assistant.suggest.Suggestions;
import nu.marginalia.functions.domains.DomainInfoGrpcService; import nu.marginalia.functions.domains.DomainInfoGrpcService;
import nu.marginalia.functions.math.MathGrpcService; import nu.marginalia.functions.math.MathGrpcService;
import nu.marginalia.livecapture.LiveCaptureGrpcService; import nu.marginalia.livecapture.LiveCaptureGrpcService;
import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.rss.svc.FeedsGrpcService; import nu.marginalia.rss.svc.FeedsGrpcService;
import nu.marginalia.screenshot.ScreenshotService;
import nu.marginalia.service.discovery.property.ServicePartition; import nu.marginalia.service.discovery.property.ServicePartition;
import nu.marginalia.service.server.BaseServiceParams; import nu.marginalia.service.server.BaseServiceParams;
import nu.marginalia.service.server.SparkService; import nu.marginalia.service.server.JoobyService;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import spark.Spark;
import java.util.List; import java.util.List;
public class AssistantService extends SparkService { public class AssistantService extends JoobyService {
private final Logger logger = LoggerFactory.getLogger(getClass()); private final Logger logger = LoggerFactory.getLogger(getClass());
private final Gson gson = GsonFactory.get(); private final Gson gson = GsonFactory.get();
@org.jetbrains.annotations.NotNull
private final ScreenshotService screenshotService;
private final Suggestions suggestions; private final Suggestions suggestions;
@Inject @Inject
@@ -39,30 +39,30 @@ public class AssistantService extends SparkService {
List.of(domainInfoGrpcService, List.of(domainInfoGrpcService,
mathGrpcService, mathGrpcService,
liveCaptureGrpcService, liveCaptureGrpcService,
feedsGrpcService)); feedsGrpcService),
List.of());
this.screenshotService = screenshotService;
this.suggestions = suggestions; this.suggestions = suggestions;
Spark.staticFiles.expireTime(600);
Spark.get("/screenshot/:id", screenshotService::serveScreenshotRequest);
Spark.get("/suggest/", this::getSuggestions, this::convertToJson);
Spark.awaitInitialization();
} }
private Object getSuggestions(Request request, Response response) { public void startJooby(Jooby jooby) {
response.type("application/json"); super.startJooby(jooby);
var param = request.queryParams("partial");
if (param == null) { jooby.get("/suggest/", this::getSuggestions);
jooby.get("/screenshot/{id}", screenshotService::serveScreenshotRequest);
}
private String getSuggestions(Context context) {
context.setResponseType("application/json");
var param = context.query("partial");
if (param.isMissing()) {
logger.warn("Bad parameter, partial is null"); logger.warn("Bad parameter, partial is null");
Spark.halt(500); context.setResponseCode(500);
return "{}";
} }
return suggestions.getSuggestions(10, param); return gson.toJson(suggestions.getSuggestions(10, param.value()));
}
private String convertToJson(Object o) {
return gson.toJson(o);
} }
} }

View File

@@ -0,0 +1,118 @@
package nu.marginalia.assistant;
import com.google.common.base.Strings;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import io.jooby.Context;
import nu.marginalia.db.DbDomainQueries;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.sql.SQLException;
/**
 * Looks up and serves domain screenshots stored in the DATA_DOMAIN_SCREENSHOT table,
 * answering with a generated SVG placeholder when no stored screenshot can be served.
 */
public class ScreenshotService {
    private final DbDomainQueries domainQueries;
    private final HikariDataSource dataSource;
    private final Logger logger = LoggerFactory.getLogger(getClass());

    @Inject
    public ScreenshotService(DbDomainQueries dbDomainQueries, HikariDataSource dataSource) {
        this.domainQueries = dbDomainQueries;
        this.dataSource = dataSource;
    }

    /**
     * Checks whether a screenshot row exists for the given domain.
     *
     * @param domainId value matched against EC_DOMAIN.ID
     * @return the boolean selected by the query when a row is found; {@code false} when no
     *         row matches or when the query fails (the SQL error is logged, not rethrown)
     */
    public boolean hasScreenshot(int domainId) {
        try (var conn = dataSource.getConnection();
             var ps = conn.prepareStatement("""
                     SELECT TRUE
                     FROM DATA_DOMAIN_SCREENSHOT
                     INNER JOIN EC_DOMAIN ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME
                     WHERE EC_DOMAIN.ID=?
                     """)) {
            ps.setInt(1, domainId);

            // Close the result set explicitly instead of relying on statement closure.
            try (var rs = ps.executeQuery()) {
                if (rs.next()) {
                    return rs.getBoolean(1);
                }
            }
        }
        catch (SQLException ex) {
            logger.warn("SQL error", ex);
        }
        return false;
    }

    /**
     * Request handler that serves the screenshot for the domain named by the {@code id}
     * path parameter.
     * <p>
     * When a stored screenshot is found, it is streamed to the response with the stored
     * content type and a one hour public cache header. Otherwise an SVG placeholder is
     * returned, labelled with the domain name or "[Screenshot Not Yet Captured]" when the
     * domain lookup yields nothing.
     *
     * @param context the request context
     * @return an empty string when the response was written directly (or the id is missing,
     *         in which case the status is set to 404); otherwise the placeholder SVG document
     */
    public Object serveScreenshotRequest(Context context) {
        if (Strings.isNullOrEmpty(context.path("id").value(""))) {
            context.setResponseCode(404);
            return "";
        }

        int id = context.path("id").intValue();

        try (var conn = dataSource.getConnection();
             var ps = conn.prepareStatement("""
                     SELECT CONTENT_TYPE, DATA
                     FROM DATA_DOMAIN_SCREENSHOT
                     INNER JOIN EC_DOMAIN ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME
                     WHERE EC_DOMAIN.ID=?
                     """)) {
            ps.setInt(1, id);

            try (var rsp = ps.executeQuery()) {
                if (rsp.next()) {
                    context.setResponseType(rsp.getString(1));
                    context.setResponseCode(200);
                    context.setResponseHeader("Cache-control", "public,max-age=3600");

                    // Open the blob stream before the response stream, so that a failure to
                    // read the blob leaves the response unstarted and the placeholder can
                    // still be served. Both streams are closed when the copy finishes.
                    try (var blobStream = rsp.getBlob(2).getBinaryStream();
                         var rs = context.responseStream()) {
                        IOUtils.copy(blobStream, rs);
                    }
                    return "";
                }
            }
        }
        catch (IOException ex) {
            logger.warn("IO error", ex);
        }
        catch (SQLException ex) {
            logger.warn("SQL error", ex);
        }

        // If streaming failed part-way, the response has already begun; a placeholder
        // can no longer be sent, so skip the extra lookup and header changes.
        if (context.isResponseStarted()) {
            return "";
        }

        context.setResponseType("image/svg+xml");

        var name = domainQueries.getDomain(id).map(Object::toString)
                .orElse("[Screenshot Not Yet Captured]");

        return """
                <?xml version="1.0" encoding="UTF-8" standalone="no"?>
                <svg
                xmlns="http://www.w3.org/2000/svg"
                width="640px"
                height="480px"
                viewBox="0 0 640 480"
                version="1.1">
                <g>
                <rect
                style="fill:#808080"
                id="rect288"
                width="595.41992"
                height="430.01825"
                x="23.034981"
                y="27.850344" />
                <text
                xml:space="preserve"
                style="font-size:100px;fill:#909090;font-family:sans-serif;"
                x="20"
                y="120">Placeholder</text>
                <text
                xml:space="preserve"
                style="font-size:32px;fill:#000000;font-family:monospace;"
                x="320" y="240" dominant-baseline="middle" text-anchor="middle">%s</text>
                </g>
                </svg>
                """.formatted(name);
    }
}

View File

@@ -160,12 +160,12 @@ dependencyResolutionManagement {
library('prometheus-server', 'io.prometheus', 'simpleclient_httpserver').version('0.16.0') library('prometheus-server', 'io.prometheus', 'simpleclient_httpserver').version('0.16.0')
library('prometheus-hotspot', 'io.prometheus', 'simpleclient_hotspot').version('0.16.0') library('prometheus-hotspot', 'io.prometheus', 'simpleclient_hotspot').version('0.16.0')
library('slf4j.api', 'org.slf4j', 'slf4j-api').version('1.7.36') library('slf4j.api', 'org.slf4j', 'slf4j-api').version('2.0.3')
library('slf4j.jdk14', 'org.slf4j', 'slf4j-jdk14').version('2.0.3') library('slf4j.jdk14', 'org.slf4j', 'slf4j-jdk14').version('2.0.3')
library('log4j.api', 'org.apache.logging.log4j', 'log4j-api').version('2.17.2') library('log4j.api', 'org.apache.logging.log4j', 'log4j-api').version('2.24.3')
library('log4j.core', 'org.apache.logging.log4j', 'log4j-core').version('2.17.2') library('log4j.core', 'org.apache.logging.log4j', 'log4j-core').version('2.24.3')
library('log4j.slf4j', 'org.apache.logging.log4j', 'log4j-slf4j-impl').version('2.17.2') library('log4j.slf4j', 'org.apache.logging.log4j', 'log4j-slf4j2-impl').version('2.24.3')
library('notnull','org.jetbrains','annotations').version('24.0.0') library('notnull','org.jetbrains','annotations').version('24.0.0')
@@ -239,6 +239,7 @@ dependencyResolutionManagement {
library('jooby-jte','io.jooby','jooby-jte').version(joobyVersion) library('jooby-jte','io.jooby','jooby-jte').version(joobyVersion)
library('jooby-apt','io.jooby','jooby-apt').version(joobyVersion) library('jooby-apt','io.jooby','jooby-apt').version(joobyVersion)
library('wiremock', 'org.wiremock','wiremock').version('3.11.0')
library('jte','gg.jte','jte').version('3.1.15') library('jte','gg.jte','jte').version('3.1.15')
bundle('jetty', ['jetty-server', 'jetty-util', 'jetty-servlet']) bundle('jetty', ['jetty-server', 'jetty-util', 'jetty-servlet'])