1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

8 Commits

Author SHA1 Message Date
Viktor Lofgren
f076d05595 (deps) Upgrade slf4j to latest 2025-02-15 12:50:16 +01:00
Viktor Lofgren
b513809710 (*) Stopgap fix for metrics server initialization errors bringing down services 2025-02-14 17:09:48 +01:00
Viktor Lofgren
7519b28e21 (search) Correct exception from misbehaving bots feeding invalid urls 2025-02-14 17:05:24 +01:00
Viktor Lofgren
3eac4dd57f (search) Correct exception in error handler when page is missing 2025-02-14 17:00:21 +01:00
Viktor Lofgren
4c2810720a (search) Add redirect handler for full URLs in the /site endpoint 2025-02-14 16:31:11 +01:00
Viktor Lofgren
8480ba8daa (live-capture) Code cleanup 2025-02-04 14:05:36 +01:00
Viktor Lofgren
fbba392491 (live-capture) Send a UA-string from the browserless fetcher as well
The change also introduces a somewhat convoluted wiremock test to intercept and verify that these headers are in fact sent
2025-02-04 13:36:49 +01:00
Viktor Lofgren
530eb35949 (update-rss) Do not fail the feed fetcher control actor if it takes a long time to complete. 2025-02-03 11:35:32 +01:00
11 changed files with 141 additions and 35 deletions

View File

@@ -6,6 +6,7 @@ import nu.marginalia.service.ServiceId;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.InetAddress;
import java.net.NetworkInterface;
import java.util.Enumeration;
@@ -115,7 +116,7 @@ public class ServiceConfigurationModule extends AbstractModule {
}
}
public static String getLocalNetworkIP() throws Exception {
public static String getLocalNetworkIP() throws IOException {
Enumeration<NetworkInterface> nets = NetworkInterface.getNetworkInterfaces();
while (nets.hasMoreElements()) {

View File

@@ -6,25 +6,34 @@ import nu.marginalia.service.module.ServiceConfiguration;
import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.servlet.ServletContextHandler;
import org.eclipse.jetty.servlet.ServletHolder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.InetSocketAddress;
public class MetricsServer {
private static Logger logger = LoggerFactory.getLogger(MetricsServer.class);
@Inject
public MetricsServer(ServiceConfiguration configuration) throws Exception {
public MetricsServer(ServiceConfiguration configuration) {
// If less than zero, we forego setting up a metrics server
if (configuration.metricsPort() < 0)
return;
Server server = new Server(new InetSocketAddress(configuration.bindAddress(), configuration.metricsPort()));
try {
Server server = new Server(new InetSocketAddress(configuration.bindAddress(), configuration.metricsPort()));
ServletContextHandler context = new ServletContextHandler();
context.setContextPath("/");
server.setHandler(context);
ServletContextHandler context = new ServletContextHandler();
context.setContextPath("/");
server.setHandler(context);
context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
server.start();
server.start();
}
catch (Exception|NoSuchMethodError ex) {
logger.error("Failed to set up metrics server", ex);
}
}
}

View File

@@ -14,6 +14,8 @@ import nu.marginalia.mq.persistence.MqPersistence;
import nu.marginalia.nodecfg.NodeConfigurationService;
import nu.marginalia.nodecfg.model.NodeProfile;
import nu.marginalia.service.module.ServiceConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.time.Duration;
import java.time.LocalDateTime;
@@ -29,6 +31,7 @@ public class UpdateRssActor extends RecordActorPrototype {
private final NodeConfigurationService nodeConfigurationService;
private final MqPersistence persistence;
private static final Logger logger = LoggerFactory.getLogger(UpdateRssActor.class);
@Inject
public UpdateRssActor(Gson gson,
@@ -101,8 +104,8 @@ public class UpdateRssActor extends RecordActorPrototype {
case UpdateRefresh(int count, long msgId) -> {
MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
if (msg == null) {
// Retry the update
yield new Error("Failed to update feeds: message not found");
logger.warn("UpdateRefresh is taking a very long time");
yield new UpdateRefresh(count, msgId);
} else if (msg.state() != MqMessageState.OK) {
// Retry the update
yield new Error("Failed to update feeds: " + msg.state());
@@ -119,8 +122,8 @@ public class UpdateRssActor extends RecordActorPrototype {
case UpdateClean(long msgId) -> {
MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
if (msg == null) {
// Retry the update
yield new Error("Failed to update feeds: message not found");
logger.warn("UpdateClean is taking a very long time");
yield new UpdateClean(msgId);
} else if (msg.state() != MqMessageState.OK) {
// Retry the update
yield new Error("Failed to update feeds: " + msg.state());

View File

@@ -34,6 +34,7 @@ dependencies {
implementation libs.bundles.slf4j
implementation libs.commons.lang3
implementation libs.commons.io
implementation libs.wiremock
implementation libs.prometheus
implementation libs.guava

View File

@@ -1,6 +1,7 @@
package nu.marginalia.livecapture;
import com.google.gson.Gson;
import nu.marginalia.WmsaHome;
import nu.marginalia.model.gson.GsonFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -12,6 +13,7 @@ import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;
import java.util.Map;
import java.util.Optional;
/** Client for local browserless.io API */
public class BrowserlessClient implements AutoCloseable {
@@ -27,13 +29,16 @@ public class BrowserlessClient implements AutoCloseable {
private final URI browserlessURI;
private final Gson gson = GsonFactory.get();
private final String userAgent = WmsaHome.getUserAgent().uaString();
public BrowserlessClient(URI browserlessURI) {
this.browserlessURI = browserlessURI;
}
public String content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
public Optional<String> content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
Map<String, Object> requestData = Map.of(
"url", url,
"userAgent", userAgent,
"gotoOptions", gotoOptions
);
@@ -49,10 +54,10 @@ public class BrowserlessClient implements AutoCloseable {
if (rsp.statusCode() >= 300) {
logger.info("Failed to fetch content for {}, status {}", url, rsp.statusCode());
return null;
return Optional.empty();
}
return rsp.body();
return Optional.of(rsp.body());
}
public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions)
@@ -60,6 +65,7 @@ public class BrowserlessClient implements AutoCloseable {
Map<String, Object> requestData = Map.of(
"url", url,
"userAgent", userAgent,
"options", screenshotOptions,
"gotoOptions", gotoOptions
);
@@ -84,7 +90,7 @@ public class BrowserlessClient implements AutoCloseable {
}
@Override
public void close() throws Exception {
public void close() {
httpClient.shutdownNow();
}

View File

@@ -1,5 +1,9 @@
package nu.marginalia.livecapture;
import com.github.tomakehurst.wiremock.WireMockServer;
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
import nu.marginalia.WmsaHome;
import nu.marginalia.service.module.ServiceConfigurationModule;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
@@ -8,34 +12,86 @@ import org.testcontainers.containers.GenericContainer;
import org.testcontainers.junit.jupiter.Testcontainers;
import org.testcontainers.utility.DockerImageName;
import java.io.IOException;
import java.net.URI;
import java.util.Map;
import static com.github.tomakehurst.wiremock.client.WireMock.*;
@Testcontainers
@Tag("slow")
public class BrowserlessClientTest {
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
.withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
.withNetworkMode("bridge")
.withExposedPorts(3000);
static WireMockServer wireMockServer =
new WireMockServer(WireMockConfiguration.wireMockConfig()
.port(18089));
static String localIp;
static URI browserlessURI;
@BeforeAll
public static void setup() {
public static void setup() throws IOException {
container.start();
browserlessURI = URI.create(String.format("http://%s:%d/",
container.getHost(),
container.getMappedPort(3000))
);
wireMockServer.start();
wireMockServer.stubFor(get("/").willReturn(aResponse().withStatus(200).withBody("Ok")));
localIp = ServiceConfigurationModule.getLocalNetworkIP();
}
@Tag("flaky")
@Test
public void testInspectContentUA__Flaky() throws Exception {
try (var client = new BrowserlessClient(browserlessURI)) {
client.content("http://" + localIp + ":18089/",
BrowserlessClient.GotoOptions.defaultValues()
);
}
wireMockServer.verify(getRequestedFor(urlEqualTo("/")).withHeader("User-Agent", equalTo(WmsaHome.getUserAgent().uaString())));
}
@Tag("flaky")
@Test
public void testInspectScreenshotUA__Flaky() throws Exception {
try (var client = new BrowserlessClient(browserlessURI)) {
client.screenshot("http://" + localIp + ":18089/",
BrowserlessClient.GotoOptions.defaultValues(),
BrowserlessClient.ScreenshotOptions.defaultValues()
);
}
wireMockServer.verify(getRequestedFor(urlEqualTo("/")).withHeader("User-Agent", equalTo(WmsaHome.getUserAgent().uaString())));
}
@Test
public void testContent() throws Exception {
try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) {
var content = client.content("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues());
Assertions.assertNotNull(content, "Content should not be null");
try (var client = new BrowserlessClient(browserlessURI)) {
var content = client.content("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()).orElseThrow();
Assertions.assertFalse(content.isBlank(), "Content should not be empty");
}
}
@Test
public void testScreenshot() throws Exception {
try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) {
var screenshot = client.screenshot("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues(), BrowserlessClient.ScreenshotOptions.defaultValues());
try (var client = new BrowserlessClient(browserlessURI)) {
var screenshot = client.screenshot("https://www.marginalia.nu/",
BrowserlessClient.GotoOptions.defaultValues(),
BrowserlessClient.ScreenshotOptions.defaultValues());
Assertions.assertNotNull(screenshot, "Screenshot should not be null");
}
}

View File

@@ -3,8 +3,10 @@ package nu.marginalia.search;
import com.google.inject.Inject;
import io.jooby.Context;
import io.jooby.Jooby;
import io.jooby.StatusCode;
import io.prometheus.client.Counter;
import io.prometheus.client.Histogram;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.search.svc.*;
import nu.marginalia.service.discovery.property.ServicePartition;
import nu.marginalia.service.server.BaseServiceParams;
@@ -16,6 +18,7 @@ import java.util.List;
public class SearchService extends JoobyService {
private final WebsiteUrl websiteUrl;
private final SearchSiteSubscriptionService siteSubscriptionService;
private static final Logger logger = LoggerFactory.getLogger(SearchService.class);
@@ -33,6 +36,7 @@ public class SearchService extends JoobyService {
@Inject
public SearchService(BaseServiceParams params,
WebsiteUrl websiteUrl,
SearchFrontPageService frontPageService,
SearchAddToCrawlQueueService addToCrawlQueueService,
SearchSiteSubscriptionService siteSubscriptionService,
@@ -51,6 +55,7 @@ public class SearchService extends JoobyService {
new SearchAddToCrawlQueueService_(addToCrawlQueueService),
new SearchBrowseService_(searchBrowseService)
));
this.websiteUrl = websiteUrl;
this.siteSubscriptionService = siteSubscriptionService;
}
@@ -62,6 +67,10 @@ public class SearchService extends JoobyService {
final String startTimeAttribute = "start-time";
jooby.get("/export-opml", siteSubscriptionService::exportOpml);
jooby.get("/site/https://*", this::handleSiteUrlRedirect);
jooby.get("/site/http://*", this::handleSiteUrlRedirect);
jooby.before((Context ctx) -> {
ctx.setAttribute(startTimeAttribute, System.nanoTime());
});
@@ -80,5 +89,19 @@ public class SearchService extends JoobyService {
});
}
/** Redirect handler for the case when the user passes
* an url like /site/https://example.com/, in this
* scenario we want to extract the domain name and redirect
* to /site/example.com/
*/
private Context handleSiteUrlRedirect(Context ctx) {
var pv = ctx.path("*").value();
int trailSlash = pv.indexOf('/');
if (trailSlash > 0) {
pv = pv.substring(0, trailSlash);
}
ctx.sendRedirect(StatusCode.TEMPORARY_REDIRECT, websiteUrl.withPath("site/" + pv));
return ctx;
}
}

View File

@@ -86,8 +86,10 @@ public record SearchParameters(WebsiteUrl url,
public String renderUrl() {
StringBuilder pathBuilder = new StringBuilder("/search?");
pathBuilder.append("query=").append(URLEncoder.encode(query, StandardCharsets.UTF_8));
if (query != null) {
pathBuilder.append("query=").append(URLEncoder.encode(query, StandardCharsets.UTF_8));
}
if (profile != SearchProfile.NO_FILTER) {
pathBuilder.append("&profile=").append(URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8));
}

View File

@@ -56,7 +56,9 @@ public class SearchQueryService {
}
catch (Exception ex) {
logger.error("Error", ex);
return errorPageService.serveError(SearchParameters.defaultsForQuery(websiteUrl, query, page));
return errorPageService.serveError(
SearchParameters.defaultsForQuery(websiteUrl, query, Objects.requireNonNullElse(page, 1))
);
}
}

View File

@@ -23,7 +23,12 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
apply from: "$rootProject.projectDir/docker.gradle"
dependencies {
implementation project(':third-party:symspell')
implementation project(':code:common:db')
implementation project(':code:common:model')
implementation project(':code:common:service')
implementation project(':code:common:config')
implementation project(':code:functions:live-capture')
implementation project(':code:functions:live-capture:api')
@@ -32,19 +37,16 @@ dependencies {
implementation project(':code:functions:domain-info')
implementation project(':code:functions:domain-info:api')
implementation project(':code:common:config')
implementation project(':code:common:service')
implementation project(':code:common:model')
implementation project(':code:common:db')
implementation project(':code:features-search:screenshots')
implementation project(':code:libraries:geo-ip')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation libs.bundles.slf4j
implementation project(':third-party:symspell')
implementation libs.bundles.slf4j
implementation libs.prometheus
implementation libs.guava
libs.bundles.grpc.get().each {

View File

@@ -160,12 +160,12 @@ dependencyResolutionManagement {
library('prometheus-server', 'io.prometheus', 'simpleclient_httpserver').version('0.16.0')
library('prometheus-hotspot', 'io.prometheus', 'simpleclient_hotspot').version('0.16.0')
library('slf4j.api', 'org.slf4j', 'slf4j-api').version('1.7.36')
library('slf4j.api', 'org.slf4j', 'slf4j-api').version('2.0.3')
library('slf4j.jdk14', 'org.slf4j', 'slf4j-jdk14').version('2.0.3')
library('log4j.api', 'org.apache.logging.log4j', 'log4j-api').version('2.17.2')
library('log4j.core', 'org.apache.logging.log4j', 'log4j-core').version('2.17.2')
library('log4j.slf4j', 'org.apache.logging.log4j', 'log4j-slf4j-impl').version('2.17.2')
library('log4j.api', 'org.apache.logging.log4j', 'log4j-api').version('2.24.3')
library('log4j.core', 'org.apache.logging.log4j', 'log4j-core').version('2.24.3')
library('log4j.slf4j', 'org.apache.logging.log4j', 'log4j-slf4j2-impl').version('2.24.3')
library('notnull','org.jetbrains','annotations').version('24.0.0')
@@ -239,6 +239,7 @@ dependencyResolutionManagement {
library('jooby-jte','io.jooby','jooby-jte').version(joobyVersion)
library('jooby-apt','io.jooby','jooby-apt').version(joobyVersion)
library('wiremock', 'org.wiremock','wiremock').version('3.11.0')
library('jte','gg.jte','jte').version('3.1.15')
bundle('jetty', ['jetty-server', 'jetty-util', 'jetty-servlet'])