1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 17:32:39 +02:00

Compare commits

...

7 Commits

Author SHA1 Message Date
Viktor Lofgren
4c2810720a (search) Add redirect handler for full URLs in the /site endpoint 2025-02-14 16:31:11 +01:00
Viktor Lofgren
8480ba8daa (live-capture) Code cleanup 2025-02-04 14:05:36 +01:00
Viktor Lofgren
fbba392491 (live-capture) Send a UA-string from the browserless fetcher as well
The change also introduces a somewhat convoluted wiremock test to intercept and verify that these headers are in fact sent.
2025-02-04 13:36:49 +01:00
Viktor Lofgren
530eb35949 (update-rss) Do not fail the feed fetcher control actor if it takes a long time to complete. 2025-02-03 11:35:32 +01:00
Viktor Lofgren
c2dd2175a2 (search) Add new query expansion rule contracting WORD NUM pairs into WORD-NUM and WORDNUM 2025-02-01 13:13:30 +01:00
Viktor Lofgren
b8581b0f56 (crawler) Safe sanitization of headers during warc->slop conversion
The warc->slop converter was rejecting some items because they had headers that were representable in the Warc code's MessageHeader map implementation, but illegal in the HttpHeaders' implementation.

Fixing this by manually filtering these out.  Ostensibly the constructor has a filtering predicate, but this annoyingly runs too late and fails to prevent the problem.
2025-01-31 12:47:42 +01:00
Viktor Lofgren
2ea34767d8 (crawler) Use the response URL when resolving relative links
The crawler was incorrectly using the request URL as the base URL when resolving relative links.  This caused problems when encountering redirects.

For example, if we fetch /log, which redirects to /log/, and find links to foo/ and bar/, these would resolve to /foo and /bar rather than /log/foo and /log/bar.
2025-01-31 12:40:13 +01:00
11 changed files with 142 additions and 20 deletions

View File

@@ -6,6 +6,7 @@ import nu.marginalia.service.ServiceId;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.InetAddress; import java.net.InetAddress;
import java.net.NetworkInterface; import java.net.NetworkInterface;
import java.util.Enumeration; import java.util.Enumeration;
@@ -115,7 +116,7 @@ public class ServiceConfigurationModule extends AbstractModule {
} }
} }
public static String getLocalNetworkIP() throws Exception { public static String getLocalNetworkIP() throws IOException {
Enumeration<NetworkInterface> nets = NetworkInterface.getNetworkInterfaces(); Enumeration<NetworkInterface> nets = NetworkInterface.getNetworkInterfaces();
while (nets.hasMoreElements()) { while (nets.hasMoreElements()) {

View File

@@ -14,6 +14,8 @@ import nu.marginalia.mq.persistence.MqPersistence;
import nu.marginalia.nodecfg.NodeConfigurationService; import nu.marginalia.nodecfg.NodeConfigurationService;
import nu.marginalia.nodecfg.model.NodeProfile; import nu.marginalia.nodecfg.model.NodeProfile;
import nu.marginalia.service.module.ServiceConfiguration; import nu.marginalia.service.module.ServiceConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.time.Duration; import java.time.Duration;
import java.time.LocalDateTime; import java.time.LocalDateTime;
@@ -29,6 +31,7 @@ public class UpdateRssActor extends RecordActorPrototype {
private final NodeConfigurationService nodeConfigurationService; private final NodeConfigurationService nodeConfigurationService;
private final MqPersistence persistence; private final MqPersistence persistence;
private static final Logger logger = LoggerFactory.getLogger(UpdateRssActor.class);
@Inject @Inject
public UpdateRssActor(Gson gson, public UpdateRssActor(Gson gson,
@@ -101,8 +104,8 @@ public class UpdateRssActor extends RecordActorPrototype {
case UpdateRefresh(int count, long msgId) -> { case UpdateRefresh(int count, long msgId) -> {
MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12)); MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
if (msg == null) { if (msg == null) {
// Retry the update logger.warn("UpdateRefresh is taking a very long time");
yield new Error("Failed to update feeds: message not found"); yield new UpdateRefresh(count, msgId);
} else if (msg.state() != MqMessageState.OK) { } else if (msg.state() != MqMessageState.OK) {
// Retry the update // Retry the update
yield new Error("Failed to update feeds: " + msg.state()); yield new Error("Failed to update feeds: " + msg.state());
@@ -119,8 +122,8 @@ public class UpdateRssActor extends RecordActorPrototype {
case UpdateClean(long msgId) -> { case UpdateClean(long msgId) -> {
MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12)); MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
if (msg == null) { if (msg == null) {
// Retry the update logger.warn("UpdateClean is taking a very long time");
yield new Error("Failed to update feeds: message not found"); yield new UpdateClean(msgId);
} else if (msg.state() != MqMessageState.OK) { } else if (msg.state() != MqMessageState.OK) {
// Retry the update // Retry the update
yield new Error("Failed to update feeds: " + msg.state()); yield new Error("Failed to update feeds: " + msg.state());

View File

@@ -34,6 +34,7 @@ dependencies {
implementation libs.bundles.slf4j implementation libs.bundles.slf4j
implementation libs.commons.lang3 implementation libs.commons.lang3
implementation libs.commons.io implementation libs.commons.io
implementation libs.wiremock
implementation libs.prometheus implementation libs.prometheus
implementation libs.guava implementation libs.guava

View File

@@ -1,6 +1,7 @@
package nu.marginalia.livecapture; package nu.marginalia.livecapture;
import com.google.gson.Gson; import com.google.gson.Gson;
import nu.marginalia.WmsaHome;
import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.model.gson.GsonFactory;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@@ -12,6 +13,7 @@ import java.net.http.HttpRequest;
import java.net.http.HttpResponse; import java.net.http.HttpResponse;
import java.time.Duration; import java.time.Duration;
import java.util.Map; import java.util.Map;
import java.util.Optional;
/** Client for local browserless.io API */ /** Client for local browserless.io API */
public class BrowserlessClient implements AutoCloseable { public class BrowserlessClient implements AutoCloseable {
@@ -27,13 +29,16 @@ public class BrowserlessClient implements AutoCloseable {
private final URI browserlessURI; private final URI browserlessURI;
private final Gson gson = GsonFactory.get(); private final Gson gson = GsonFactory.get();
private final String userAgent = WmsaHome.getUserAgent().uaString();
public BrowserlessClient(URI browserlessURI) { public BrowserlessClient(URI browserlessURI) {
this.browserlessURI = browserlessURI; this.browserlessURI = browserlessURI;
} }
public String content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException { public Optional<String> content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
Map<String, Object> requestData = Map.of( Map<String, Object> requestData = Map.of(
"url", url, "url", url,
"userAgent", userAgent,
"gotoOptions", gotoOptions "gotoOptions", gotoOptions
); );
@@ -49,10 +54,10 @@ public class BrowserlessClient implements AutoCloseable {
if (rsp.statusCode() >= 300) { if (rsp.statusCode() >= 300) {
logger.info("Failed to fetch content for {}, status {}", url, rsp.statusCode()); logger.info("Failed to fetch content for {}, status {}", url, rsp.statusCode());
return null; return Optional.empty();
} }
return rsp.body(); return Optional.of(rsp.body());
} }
public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions) public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions)
@@ -60,6 +65,7 @@ public class BrowserlessClient implements AutoCloseable {
Map<String, Object> requestData = Map.of( Map<String, Object> requestData = Map.of(
"url", url, "url", url,
"userAgent", userAgent,
"options", screenshotOptions, "options", screenshotOptions,
"gotoOptions", gotoOptions "gotoOptions", gotoOptions
); );
@@ -84,7 +90,7 @@ public class BrowserlessClient implements AutoCloseable {
} }
@Override @Override
public void close() throws Exception { public void close() {
httpClient.shutdownNow(); httpClient.shutdownNow();
} }

View File

@@ -1,5 +1,9 @@
package nu.marginalia.livecapture; package nu.marginalia.livecapture;
import com.github.tomakehurst.wiremock.WireMockServer;
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
import nu.marginalia.WmsaHome;
import nu.marginalia.service.module.ServiceConfigurationModule;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Tag;
@@ -8,34 +12,86 @@ import org.testcontainers.containers.GenericContainer;
import org.testcontainers.junit.jupiter.Testcontainers; import org.testcontainers.junit.jupiter.Testcontainers;
import org.testcontainers.utility.DockerImageName; import org.testcontainers.utility.DockerImageName;
import java.io.IOException;
import java.net.URI; import java.net.URI;
import java.util.Map; import java.util.Map;
import static com.github.tomakehurst.wiremock.client.WireMock.*;
@Testcontainers @Testcontainers
@Tag("slow") @Tag("slow")
public class BrowserlessClientTest { public class BrowserlessClientTest {
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome")) static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
.withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN")) .withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
.withNetworkMode("bridge")
.withExposedPorts(3000); .withExposedPorts(3000);
static WireMockServer wireMockServer =
new WireMockServer(WireMockConfiguration.wireMockConfig()
.port(18089));
static String localIp;
static URI browserlessURI;
@BeforeAll @BeforeAll
public static void setup() { public static void setup() throws IOException {
container.start(); container.start();
browserlessURI = URI.create(String.format("http://%s:%d/",
container.getHost(),
container.getMappedPort(3000))
);
wireMockServer.start();
wireMockServer.stubFor(get("/").willReturn(aResponse().withStatus(200).withBody("Ok")));
localIp = ServiceConfigurationModule.getLocalNetworkIP();
}
/** Verifies that a content fetch made through browserless presents our
 * configured User-Agent string to the target server.  The target is the
 * local WireMock server, so the incoming request's headers can be inspected.
 */
@Tag("flaky")
@Test
public void testInspectContentUA__Flaky() throws Exception {
    // The WireMock server is shared by every test in this class, and verify()
    // matches against all requests recorded so far.  Clear the request journal
    // so that only the request triggered below can satisfy the assertion.
    wireMockServer.resetRequests();

    try (var client = new BrowserlessClient(browserlessURI)) {
        client.content("http://" + localIp + ":18089/",
                BrowserlessClient.GotoOptions.defaultValues()
        );
    }

    wireMockServer.verify(getRequestedFor(urlEqualTo("/"))
            .withHeader("User-Agent", equalTo(WmsaHome.getUserAgent().uaString())));
}
@Tag("flaky")
@Test
public void testInspectScreenshotUA__Flaky() throws Exception {
try (var client = new BrowserlessClient(browserlessURI)) {
client.screenshot("http://" + localIp + ":18089/",
BrowserlessClient.GotoOptions.defaultValues(),
BrowserlessClient.ScreenshotOptions.defaultValues()
);
}
wireMockServer.verify(getRequestedFor(urlEqualTo("/")).withHeader("User-Agent", equalTo(WmsaHome.getUserAgent().uaString())));
} }
@Test @Test
public void testContent() throws Exception { public void testContent() throws Exception {
try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) { try (var client = new BrowserlessClient(browserlessURI)) {
var content = client.content("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()); var content = client.content("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()).orElseThrow();
Assertions.assertNotNull(content, "Content should not be null");
Assertions.assertFalse(content.isBlank(), "Content should not be empty"); Assertions.assertFalse(content.isBlank(), "Content should not be empty");
} }
} }
@Test @Test
public void testScreenshot() throws Exception { public void testScreenshot() throws Exception {
try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) { try (var client = new BrowserlessClient(browserlessURI)) {
var screenshot = client.screenshot("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues(), BrowserlessClient.ScreenshotOptions.defaultValues()); var screenshot = client.screenshot("https://www.marginalia.nu/",
BrowserlessClient.GotoOptions.defaultValues(),
BrowserlessClient.ScreenshotOptions.defaultValues());
Assertions.assertNotNull(screenshot, "Screenshot should not be null"); Assertions.assertNotNull(screenshot, "Screenshot should not be null");
} }
} }

View File

@@ -134,6 +134,10 @@ public class QueryExpansion {
if (scoreCombo > scoreA + scoreB || scoreCombo > 1000) { if (scoreCombo > scoreA + scoreB || scoreCombo > 1000) {
graph.addVariantForSpan(prev, qw, joinedWord); graph.addVariantForSpan(prev, qw, joinedWord);
} }
else if (StringUtils.isAlpha(prev.word()) && StringUtils.isNumeric(qw.word())) { // join e.g. trs 80 to trs80 and trs-80
graph.addVariantForSpan(prev, qw, prev.word() + qw.word());
graph.addVariantForSpan(prev, qw, prev.word() + "-" + qw.word());
}
} }
prev = qw; prev = qw;

View File

@@ -213,6 +213,18 @@ public class QueryFactoryTest {
System.out.println(subquery); System.out.println(subquery);
} }
/** A WORD NUM query should keep both original terms and also gain the
 * hyphenated (WORD-NUM) and concatenated (WORDNUM) variants.
 */
@Test
public void testContractionWordNum() {
    var compiled = parseAndGetSpecs("glove 80").query.compiledQuery;

    // Each term is matched with surrounding spaces, in the same order as before
    String[] expectedTerms = { " glove ", " 80 ", " glove-80 ", " glove80 " };
    for (String term : expectedTerms) {
        Assertions.assertTrue(compiled.contains(term));
    }
}
@Test @Test
public void testCplusPlus() { public void testCplusPlus() {
var subquery = parseAndGetSpecs("std::vector::push_back vector"); var subquery = parseAndGetSpecs("std::vector::push_back vector");

View File

@@ -381,8 +381,10 @@ public class CrawlerRetreiver implements AutoCloseable {
if (docOpt.isPresent()) { if (docOpt.isPresent()) {
var doc = docOpt.get(); var doc = docOpt.get();
crawlFrontier.enqueueLinksFromDocument(top, doc); var responseUrl = new EdgeUrl(ok.uri());
crawlFrontier.addVisited(new EdgeUrl(ok.uri()));
crawlFrontier.enqueueLinksFromDocument(responseUrl, doc);
crawlFrontier.addVisited(responseUrl);
} }
} }
else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) { else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {

View File

@@ -12,8 +12,7 @@ import java.io.InputStream;
import java.net.InetAddress; import java.net.InetAddress;
import java.net.URI; import java.net.URI;
import java.net.http.HttpHeaders; import java.net.http.HttpHeaders;
import java.util.Arrays; import java.util.*;
import java.util.Optional;
/* FIXME: This interface has a very unfortunate name that is not very descriptive. /* FIXME: This interface has a very unfortunate name that is not very descriptive.
*/ */
@@ -65,7 +64,21 @@ public sealed interface HttpFetchResult {
) implements HttpFetchResult { ) implements HttpFetchResult {
public ResultOk(URI uri, int status, MessageHeaders headers, String ipAddress, byte[] bytes, int bytesStart, int length) { public ResultOk(URI uri, int status, MessageHeaders headers, String ipAddress, byte[] bytes, int bytesStart, int length) {
this(uri, status, HttpHeaders.of(headers.map(), (k,v) -> true), ipAddress, bytes, bytesStart, length); this(uri, status, convertHeaders(headers), ipAddress, bytes, bytesStart, length);
}
private static HttpHeaders convertHeaders(MessageHeaders messageHeaders) {
Map<String, List<String>> inputMap = messageHeaders.map();
Map<String, List<String>> filteredMap = new HashMap<>(Math.max(4, inputMap.size()));
inputMap.forEach((k, v) -> {
if (k.isBlank()) return;
if (!Character.isAlphabetic(k.charAt(0))) return;
filteredMap.put(k, v);
});
return HttpHeaders.of(filteredMap, (k,v) -> true);
} }
public boolean isOk() { public boolean isOk() {

View File

@@ -3,8 +3,10 @@ package nu.marginalia.search;
import com.google.inject.Inject; import com.google.inject.Inject;
import io.jooby.Context; import io.jooby.Context;
import io.jooby.Jooby; import io.jooby.Jooby;
import io.jooby.StatusCode;
import io.prometheus.client.Counter; import io.prometheus.client.Counter;
import io.prometheus.client.Histogram; import io.prometheus.client.Histogram;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.search.svc.*; import nu.marginalia.search.svc.*;
import nu.marginalia.service.discovery.property.ServicePartition; import nu.marginalia.service.discovery.property.ServicePartition;
import nu.marginalia.service.server.BaseServiceParams; import nu.marginalia.service.server.BaseServiceParams;
@@ -16,6 +18,7 @@ import java.util.List;
public class SearchService extends JoobyService { public class SearchService extends JoobyService {
private final WebsiteUrl websiteUrl;
private final SearchSiteSubscriptionService siteSubscriptionService; private final SearchSiteSubscriptionService siteSubscriptionService;
private static final Logger logger = LoggerFactory.getLogger(SearchService.class); private static final Logger logger = LoggerFactory.getLogger(SearchService.class);
@@ -33,6 +36,7 @@ public class SearchService extends JoobyService {
@Inject @Inject
public SearchService(BaseServiceParams params, public SearchService(BaseServiceParams params,
WebsiteUrl websiteUrl,
SearchFrontPageService frontPageService, SearchFrontPageService frontPageService,
SearchAddToCrawlQueueService addToCrawlQueueService, SearchAddToCrawlQueueService addToCrawlQueueService,
SearchSiteSubscriptionService siteSubscriptionService, SearchSiteSubscriptionService siteSubscriptionService,
@@ -51,6 +55,7 @@ public class SearchService extends JoobyService {
new SearchAddToCrawlQueueService_(addToCrawlQueueService), new SearchAddToCrawlQueueService_(addToCrawlQueueService),
new SearchBrowseService_(searchBrowseService) new SearchBrowseService_(searchBrowseService)
)); ));
this.websiteUrl = websiteUrl;
this.siteSubscriptionService = siteSubscriptionService; this.siteSubscriptionService = siteSubscriptionService;
} }
@@ -62,6 +67,10 @@ public class SearchService extends JoobyService {
final String startTimeAttribute = "start-time"; final String startTimeAttribute = "start-time";
jooby.get("/export-opml", siteSubscriptionService::exportOpml); jooby.get("/export-opml", siteSubscriptionService::exportOpml);
jooby.get("/site/https://*", this::handleSiteUrlRedirect);
jooby.get("/site/http://*", this::handleSiteUrlRedirect);
jooby.before((Context ctx) -> { jooby.before((Context ctx) -> {
ctx.setAttribute(startTimeAttribute, System.nanoTime()); ctx.setAttribute(startTimeAttribute, System.nanoTime());
}); });
@@ -80,5 +89,19 @@ public class SearchService extends JoobyService {
}); });
} }
/** Redirect handler for requests where a full URL was given to the
 * site endpoint, such as /site/https://example.com/some/path.
 * <p>
 * The wildcard part of the path is cut at its first slash, provided that
 * slash is not the very first character, and the client is then sent a
 * temporary redirect to the "site/" path for whatever remains.
 */
private Context handleSiteUrlRedirect(Context ctx) {
    final String remainder = ctx.path("*").value();
    final int firstSlash = remainder.indexOf('/');

    // Leave the value untouched when there is no slash, or when it is leading
    final String target = (firstSlash > 0)
            ? remainder.substring(0, firstSlash)
            : remainder;

    ctx.sendRedirect(StatusCode.TEMPORARY_REDIRECT, websiteUrl.withPath("site/" + target));
    return ctx;
}
} }

View File

@@ -239,6 +239,7 @@ dependencyResolutionManagement {
library('jooby-jte','io.jooby','jooby-jte').version(joobyVersion) library('jooby-jte','io.jooby','jooby-jte').version(joobyVersion)
library('jooby-apt','io.jooby','jooby-apt').version(joobyVersion) library('jooby-apt','io.jooby','jooby-apt').version(joobyVersion)
library('wiremock', 'org.wiremock','wiremock').version('3.11.0')
library('jte','gg.jte','jte').version('3.1.15') library('jte','gg.jte','jte').version('3.1.15')
bundle('jetty', ['jetty-server', 'jetty-util', 'jetty-servlet']) bundle('jetty', ['jetty-server', 'jetty-util', 'jetty-servlet'])