1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits

...

12 Commits

Author SHA1 Message Date
Viktor Lofgren
8c088a7c0b (crawler) Remove custom thread factory
This was causing issues, and not really doing much of benefit.
2025-03-09 11:50:52 +01:00
Viktor Lofgren
ea9a642b9b (crawler) More effective task scheduling in the crawler
This should hopefully allow more threads to be busy
2025-03-09 11:44:59 +01:00
Viktor Lofgren
27f528af6a (search) Fix "Remove Javascript" toggle
A bug was introduced at some point where the special keyword for filtering on javascript was changed to special:scripts, from js:true/js:false.

Solves issue #155
2025-02-28 12:03:04 +01:00
Viktor Lofgren
20ca41ec95 (processed model) Use String columns instead of Txt columns for SlopDocumentRecord
It's very likely TxtStringColumn is the culprit of the bug seen in https://github.com/MarginaliaSearch/MarginaliaSearch/issues/154 where the wrong URL was shown for a search result.
2025-02-24 11:41:51 +01:00
Viktor Lofgren
7671f0d9e4 (search) Display message when no search results are found 2025-02-24 11:15:55 +01:00
Viktor Lofgren
44d6bc71b7 (assistant) Migrate to Jooby framework 2025-02-15 13:28:12 +01:00
Viktor Lofgren
9d302e2973 (assistant) Migrate to Jooby framework 2025-02-15 13:26:04 +01:00
Viktor Lofgren
f553701224 (assistant) Migrate to Jooby framework 2025-02-15 13:21:48 +01:00
Viktor Lofgren
f076d05595 (deps) Upgrade slf4j to latest 2025-02-15 12:50:16 +01:00
Viktor Lofgren
b513809710 (*) Stopgap fix for metrics server initialization errors bringing down services 2025-02-14 17:09:48 +01:00
Viktor Lofgren
7519b28e21 (search) Correct exception from misbehaving bots feeding invalid urls 2025-02-14 17:05:24 +01:00
Viktor Lofgren
3eac4dd57f (search) Correct exception in error handler when page is missing 2025-02-14 17:00:21 +01:00
17 changed files with 263 additions and 78 deletions

View File

@@ -15,6 +15,7 @@ import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
@@ -106,9 +107,12 @@ public class JoobyService {
config.externalAddress());
// FIXME: This won't work outside of docker, may need to submit a PR to jooby to allow classpaths here
jooby.install(new JteModule(Path.of("/app/resources/jte"), Path.of("/app/classes/jte-precompiled")));
jooby.assets("/*", Paths.get("/app/resources/static"));
if (Files.exists(Path.of("/app/resources/jte")) || Files.exists(Path.of("/app/classes/jte-precompiled"))) {
jooby.install(new JteModule(Path.of("/app/resources/jte"), Path.of("/app/classes/jte-precompiled")));
}
if (Files.exists(Path.of("/app/resources/static"))) {
jooby.assets("/*", Paths.get("/app/resources/static"));
}
var options = new ServerOptions();
options.setHost(config.bindAddress());
options.setPort(restEndpoint.port());

View File

@@ -6,25 +6,34 @@ import nu.marginalia.service.module.ServiceConfiguration;
import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.servlet.ServletContextHandler;
import org.eclipse.jetty.servlet.ServletHolder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.InetSocketAddress;
public class MetricsServer {
private static Logger logger = LoggerFactory.getLogger(MetricsServer.class);
@Inject
public MetricsServer(ServiceConfiguration configuration) throws Exception {
public MetricsServer(ServiceConfiguration configuration) {
// If less than zero, we forego setting up a metrics server
if (configuration.metricsPort() < 0)
return;
Server server = new Server(new InetSocketAddress(configuration.bindAddress(), configuration.metricsPort()));
try {
Server server = new Server(new InetSocketAddress(configuration.bindAddress(), configuration.metricsPort()));
ServletContextHandler context = new ServletContextHandler();
context.setContextPath("/");
server.setHandler(context);
ServletContextHandler context = new ServletContextHandler();
context.setContextPath("/");
server.setHandler(context);
context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
server.start();
server.start();
}
catch (Exception|NoSuchMethodError ex) {
logger.error("Failed to set up metrics server", ex);
}
}
}

View File

@@ -11,7 +11,6 @@ import nu.marginalia.slop.column.primitive.IntColumn;
import nu.marginalia.slop.column.primitive.LongColumn;
import nu.marginalia.slop.column.string.EnumColumn;
import nu.marginalia.slop.column.string.StringColumn;
import nu.marginalia.slop.column.string.TxtStringColumn;
import nu.marginalia.slop.desc.StorageType;
import org.jetbrains.annotations.Nullable;
@@ -182,8 +181,8 @@ public record SlopDocumentRecord(
}
// Basic information
private static final TxtStringColumn domainsColumn = new TxtStringColumn("domain", StandardCharsets.UTF_8, StorageType.GZIP);
private static final TxtStringColumn urlsColumn = new TxtStringColumn("url", StandardCharsets.UTF_8, StorageType.GZIP);
private static final StringColumn domainsColumn = new StringColumn("domain", StandardCharsets.UTF_8, StorageType.GZIP);
private static final StringColumn urlsColumn = new StringColumn("url", StandardCharsets.UTF_8, StorageType.GZIP);
private static final VarintColumn ordinalsColumn = new VarintColumn("ordinal", StorageType.PLAIN);
private static final EnumColumn statesColumn = new EnumColumn("state", StandardCharsets.US_ASCII, StorageType.PLAIN);
private static final StringColumn stateReasonsColumn = new StringColumn("stateReason", StandardCharsets.US_ASCII, StorageType.GZIP);
@@ -211,7 +210,7 @@ public record SlopDocumentRecord(
private static final VarintCodedSequenceArrayColumn spansColumn = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD);
public static class KeywordsProjectionReader extends SlopTable {
private final TxtStringColumn.Reader domainsReader;
private final StringColumn.Reader domainsReader;
private final VarintColumn.Reader ordinalsReader;
private final IntColumn.Reader htmlFeaturesReader;
private final LongColumn.Reader domainMetadataReader;
@@ -275,8 +274,8 @@ public record SlopDocumentRecord(
}
public static class MetadataReader extends SlopTable {
private final TxtStringColumn.Reader domainsReader;
private final TxtStringColumn.Reader urlsReader;
private final StringColumn.Reader domainsReader;
private final StringColumn.Reader urlsReader;
private final VarintColumn.Reader ordinalsReader;
private final StringColumn.Reader titlesReader;
private final StringColumn.Reader descriptionsReader;
@@ -332,8 +331,8 @@ public record SlopDocumentRecord(
}
public static class Writer extends SlopTable {
private final TxtStringColumn.Writer domainsWriter;
private final TxtStringColumn.Writer urlsWriter;
private final StringColumn.Writer domainsWriter;
private final StringColumn.Writer urlsWriter;
private final VarintColumn.Writer ordinalsWriter;
private final EnumColumn.Writer statesWriter;
private final StringColumn.Writer stateReasonsWriter;

View File

@@ -41,10 +41,7 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.security.Security;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
@@ -248,22 +245,47 @@ public class CrawlerMain extends ProcessMainClass {
// (this happens when the process is restarted after a crash or a shutdown)
tasksDone.set(workLog.countFinishedJobs());
// List of deferred tasks used to ensure beneficial scheduling of domains with regard to DomainLocks,
// merely shuffling the domains tends to lead to a lot of threads being blocked waiting for a semphore,
// this will more aggressively attempt to schedule the jobs to avoid blocking
List<CrawlTask> deferredTasks = new LinkedList<>();
// Create crawl tasks and submit them to the pool for execution
for (CrawlSpecRecord crawlSpec : crawlSpecRecords) {
if (workLog.isJobFinished(crawlSpec.domain()))
continue;
var task = new CrawlTask(
// Add to the end of the deferral list
deferredTasks.addLast(new CrawlTask(
crawlSpec,
anchorTagsSource,
outputDir,
warcArchiver,
domainStateDb,
workLog);
workLog));
if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) == null) {
pool.submitQuietly(task);
}
// Start every task we currently can from the deferral list
deferredTasks.removeIf(task -> {
if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) != null) {
return true; // task has already run, duplicate in crawl specs
}
if (task.canRun()) {
// This blocks the caller when the pool is full
pool.submitQuietly(task);
return true;
}
return false;
});
}
// Schedule any lingering tasks
for (var task : deferredTasks) {
if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null)
continue;
pool.submitQuietly(task);
}
logger.info("Shutting down the pool, waiting for tasks to complete...");
@@ -346,6 +368,12 @@ public class CrawlerMain extends ProcessMainClass {
this.id = Integer.toHexString(domain.hashCode());
}
/** Best effort indicator whether we could start this now without getting stuck in
* DomainLocks purgatory */
public boolean canRun() {
return domainLocks.canLock(new EdgeDomain(domain));
}
@Override
public void run() throws Exception {

View File

@@ -251,6 +251,7 @@ public class HttpFetcherImpl implements HttpFetcher {
return new SitemapRetriever();
}
/** Recursively fetch sitemaps */
@Override
public List<EdgeUrl> fetchSitemapUrls(String root, CrawlDelayTimer delayTimer) {
try {
@@ -270,7 +271,7 @@ public class HttpFetcherImpl implements HttpFetcher {
while (!sitemapQueue.isEmpty() && ret.size() < 20_000 && ++fetchedSitemaps < 10) {
var head = sitemapQueue.removeFirst();
switch (fetchSitemap(head)) {
switch (fetchSingleSitemap(head)) {
case SitemapResult.SitemapUrls(List<String> urls) -> {
for (var url : urls) {
@@ -306,7 +307,7 @@ public class HttpFetcherImpl implements HttpFetcher {
}
private SitemapResult fetchSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
HttpRequest getRequest = HttpRequest.newBuilder()
.GET()
.uri(sitemapUrl.asURI())

View File

@@ -44,6 +44,14 @@ public class DomainLocks {
return new Semaphore(2);
}
public boolean canLock(EdgeDomain domain) {
Semaphore sem = locks.get(domain.topDomain.toLowerCase());
if (null == sem)
return true;
else
return sem.availablePermits() > 0;
}
public static class DomainLock implements AutoCloseable {
private final String domainName;
private final Semaphore semaphore;

View File

@@ -7,8 +7,7 @@ import java.util.Arrays;
public enum SearchJsParameter {
DEFAULT("default"),
DENY_JS("no-js", "js:true"),
REQUIRE_JS("yes-js", "js:false");
DENY_JS("no-js", "special:scripts");
public final String value;
public final String[] implictExcludeSearchTerms;
@@ -20,7 +19,6 @@ public enum SearchJsParameter {
public static SearchJsParameter parse(@Nullable String value) {
if (DENY_JS.value.equals(value)) return DENY_JS;
if (REQUIRE_JS.value.equals(value)) return REQUIRE_JS;
return DEFAULT;
}

View File

@@ -7,9 +7,7 @@ import java.util.Arrays;
public enum SearchJsParameter {
DEFAULT("default"),
DENY_JS("no-js", "js:true"),
REQUIRE_JS("yes-js", "js:false");
DENY_JS("no-js", "special:scripts");
public final String value;
public final String[] implictExcludeSearchTerms;
@@ -20,7 +18,6 @@ public enum SearchJsParameter {
public static SearchJsParameter parse(@Nullable String value) {
if (DENY_JS.value.equals(value)) return DENY_JS;
if (REQUIRE_JS.value.equals(value)) return REQUIRE_JS;
return DEFAULT;
}

View File

@@ -86,8 +86,10 @@ public record SearchParameters(WebsiteUrl url,
public String renderUrl() {
StringBuilder pathBuilder = new StringBuilder("/search?");
pathBuilder.append("query=").append(URLEncoder.encode(query, StandardCharsets.UTF_8));
if (query != null) {
pathBuilder.append("query=").append(URLEncoder.encode(query, StandardCharsets.UTF_8));
}
if (profile != SearchProfile.NO_FILTER) {
pathBuilder.append("&profile=").append(URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8));
}

View File

@@ -67,6 +67,10 @@ public class DecoratedSearchResults {
return focusDomainId >= 0;
}
public boolean isEmpty() {
return results.isEmpty();
}
public SearchFilters getFilters() {
return filters;
}

View File

@@ -56,7 +56,9 @@ public class SearchQueryService {
}
catch (Exception ex) {
logger.error("Error", ex);
return errorPageService.serveError(SearchParameters.defaultsForQuery(websiteUrl, query, page));
return errorPageService.serveError(
SearchParameters.defaultsForQuery(websiteUrl, query, Objects.requireNonNullElse(page, 1))
);
}
}

View File

@@ -44,6 +44,11 @@
<div class="grow"></div>
<a href="${results.getParams().renderUrlWithoutSiteFocus()}" class="fa fa-remove"></a>
</div>
@elseif (results.isEmpty())
<div class="border dark:border-gray-600 rounded flex space-x-4 bg-white dark:bg-gray-800 text-gray-600 dark:text-gray-100 text-sm p-4 items-center">
No search results found. Try different search terms, or spelling variations. The search engine currently
only supports queries in the English language.
</div>
@endif
<div class="space-y-4 sm:space-y-6">

View File

@@ -23,7 +23,12 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
apply from: "$rootProject.projectDir/docker.gradle"
dependencies {
implementation project(':third-party:symspell')
implementation project(':code:common:db')
implementation project(':code:common:model')
implementation project(':code:common:service')
implementation project(':code:common:config')
implementation project(':code:functions:live-capture')
implementation project(':code:functions:live-capture:api')
@@ -32,20 +37,16 @@ dependencies {
implementation project(':code:functions:domain-info')
implementation project(':code:functions:domain-info:api')
implementation project(':code:common:config')
implementation project(':code:common:service')
implementation project(':code:common:model')
implementation project(':code:common:db')
implementation project(':code:features-search:screenshots')
implementation project(':code:libraries:geo-ip')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation libs.bundles.slf4j
implementation project(':third-party:symspell')
implementation libs.bundles.slf4j
implementation libs.prometheus
implementation libs.commons.io
implementation libs.guava
libs.bundles.grpc.get().each {
implementation dependencies.create(it) {
@@ -59,9 +60,7 @@ dependencies {
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}
implementation dependencies.create(libs.spark.get()) {
exclude group: 'org.eclipse.jetty'
}
implementation libs.bundles.jooby
implementation libs.bundles.jetty
implementation libs.opencsv
implementation libs.trove

View File

@@ -3,6 +3,8 @@ package nu.marginalia.assistant;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import io.jooby.ExecutionMode;
import io.jooby.Jooby;
import nu.marginalia.livecapture.LivecaptureModule;
import nu.marginalia.service.MainClass;
import nu.marginalia.service.ServiceId;
@@ -38,8 +40,17 @@ public class AssistantMain extends MainClass {
var configuration = injector.getInstance(ServiceConfiguration.class);
orchestrateBoot(registry, configuration);
injector.getInstance(AssistantMain.class);
var main = injector.getInstance(AssistantMain.class);
injector.getInstance(Initialization.class).setReady();
Jooby.runApp(new String[] { "application.env=prod" }, ExecutionMode.WORKER, () -> new Jooby() {
{
main.start(this);
}
});
}
public void start(Jooby jooby) {
service.startJooby(jooby);
}
}

View File

@@ -2,27 +2,27 @@ package nu.marginalia.assistant;
import com.google.gson.Gson;
import com.google.inject.Inject;
import io.jooby.Context;
import io.jooby.Jooby;
import nu.marginalia.assistant.suggest.Suggestions;
import nu.marginalia.functions.domains.DomainInfoGrpcService;
import nu.marginalia.functions.math.MathGrpcService;
import nu.marginalia.livecapture.LiveCaptureGrpcService;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.rss.svc.FeedsGrpcService;
import nu.marginalia.screenshot.ScreenshotService;
import nu.marginalia.service.discovery.property.ServicePartition;
import nu.marginalia.service.server.BaseServiceParams;
import nu.marginalia.service.server.SparkService;
import nu.marginalia.service.server.JoobyService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import spark.Spark;
import java.util.List;
public class AssistantService extends SparkService {
public class AssistantService extends JoobyService {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Gson gson = GsonFactory.get();
@org.jetbrains.annotations.NotNull
private final ScreenshotService screenshotService;
private final Suggestions suggestions;
@Inject
@@ -39,30 +39,30 @@ public class AssistantService extends SparkService {
List.of(domainInfoGrpcService,
mathGrpcService,
liveCaptureGrpcService,
feedsGrpcService));
feedsGrpcService),
List.of());
this.screenshotService = screenshotService;
this.suggestions = suggestions;
Spark.staticFiles.expireTime(600);
Spark.get("/screenshot/:id", screenshotService::serveScreenshotRequest);
Spark.get("/suggest/", this::getSuggestions, this::convertToJson);
Spark.awaitInitialization();
}
private Object getSuggestions(Request request, Response response) {
response.type("application/json");
var param = request.queryParams("partial");
if (param == null) {
public void startJooby(Jooby jooby) {
super.startJooby(jooby);
jooby.get("/suggest/", this::getSuggestions);
jooby.get("/screenshot/{id}", screenshotService::serveScreenshotRequest);
}
private String getSuggestions(Context context) {
context.setResponseType("application/json");
var param = context.query("partial");
if (param.isMissing()) {
logger.warn("Bad parameter, partial is null");
Spark.halt(500);
context.setResponseCode(500);
return "{}";
}
return suggestions.getSuggestions(10, param);
}
private String convertToJson(Object o) {
return gson.toJson(o);
return gson.toJson(suggestions.getSuggestions(10, param.value()));
}
}

View File

@@ -0,0 +1,118 @@
package nu.marginalia.assistant;
import com.google.common.base.Strings;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import io.jooby.Context;
import nu.marginalia.db.DbDomainQueries;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.sql.SQLException;
public class ScreenshotService {
private final DbDomainQueries domainQueries;
private final HikariDataSource dataSource;
private final Logger logger = LoggerFactory.getLogger(getClass());
@Inject
public ScreenshotService(DbDomainQueries dbDomainQueries, HikariDataSource dataSource) {
this.domainQueries = dbDomainQueries;
this.dataSource = dataSource;
}
public boolean hasScreenshot(int domainId) {
try (var conn = dataSource.getConnection();
var ps = conn.prepareStatement("""
SELECT TRUE
FROM DATA_DOMAIN_SCREENSHOT
INNER JOIN EC_DOMAIN ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME
WHERE EC_DOMAIN.ID=?
""")) {
ps.setInt(1, domainId);
var rs = ps.executeQuery();
if (rs.next()) {
return rs.getBoolean(1);
}
}
catch (SQLException ex) {
logger.warn("SQL error", ex);
}
return false;
}
public Object serveScreenshotRequest(Context context) {
if (Strings.isNullOrEmpty(context.path("id").value(""))) {
context.setResponseCode(404);
return "";
}
int id = context.path("id").intValue();
try (var conn = dataSource.getConnection();
var ps = conn.prepareStatement("""
SELECT CONTENT_TYPE, DATA
FROM DATA_DOMAIN_SCREENSHOT
INNER JOIN EC_DOMAIN ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME
WHERE EC_DOMAIN.ID=?
""")) {
ps.setInt(1, id);
var rsp = ps.executeQuery();
if (rsp.next()) {
context.setResponseType(rsp.getString(1));
context.setResponseCode(200);
context.setResponseHeader("Cache-control", "public,max-age=3600");
try (var rs = context.responseStream()) {
IOUtils.copy(rsp.getBlob(2).getBinaryStream(), rs);
}
return "";
}
}
catch (IOException ex) {
logger.warn("IO error", ex);
}
catch (SQLException ex) {
logger.warn("SQL error", ex);
}
context.setResponseType("image/svg+xml");
var name = domainQueries.getDomain(id).map(Object::toString)
.orElse("[Screenshot Not Yet Captured]");
return """
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg
xmlns="http://www.w3.org/2000/svg"
width="640px"
height="480px"
viewBox="0 0 640 480"
version="1.1">
<g>
<rect
style="fill:#808080"
id="rect288"
width="595.41992"
height="430.01825"
x="23.034981"
y="27.850344" />
<text
xml:space="preserve"
style="font-size:100px;fill:#909090;font-family:sans-serif;"
x="20"
y="120">Placeholder</text>
<text
xml:space="preserve"
style="font-size:32px;fill:#000000;font-family:monospace;"
x="320" y="240" dominant-baseline="middle" text-anchor="middle">%s</text>
</g>
</svg>
""".formatted(name);
}
}

View File

@@ -160,12 +160,12 @@ dependencyResolutionManagement {
library('prometheus-server', 'io.prometheus', 'simpleclient_httpserver').version('0.16.0')
library('prometheus-hotspot', 'io.prometheus', 'simpleclient_hotspot').version('0.16.0')
library('slf4j.api', 'org.slf4j', 'slf4j-api').version('1.7.36')
library('slf4j.api', 'org.slf4j', 'slf4j-api').version('2.0.3')
library('slf4j.jdk14', 'org.slf4j', 'slf4j-jdk14').version('2.0.3')
library('log4j.api', 'org.apache.logging.log4j', 'log4j-api').version('2.17.2')
library('log4j.core', 'org.apache.logging.log4j', 'log4j-core').version('2.17.2')
library('log4j.slf4j', 'org.apache.logging.log4j', 'log4j-slf4j-impl').version('2.17.2')
library('log4j.api', 'org.apache.logging.log4j', 'log4j-api').version('2.24.3')
library('log4j.core', 'org.apache.logging.log4j', 'log4j-core').version('2.24.3')
library('log4j.slf4j', 'org.apache.logging.log4j', 'log4j-slf4j2-impl').version('2.24.3')
library('notnull','org.jetbrains','annotations').version('24.0.0')