1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits

...

17 Commits

Author SHA1 Message Date
Viktor Lofgren
9d3f9adb05 Force redeploy of everything 2025-07-23 13:36:02 +02:00
Viktor
a43a1773f1 Merge pull request #216 from MarginaliaSearch/deprecate-executor
Architecture: Remove the separate executor service and roll it into the index service.
2025-07-23 13:32:42 +02:00
Viktor Lofgren
1e7a3a3c4f (docs) Update docs to reflect the change 2025-07-23 13:18:23 +02:00
Viktor Lofgren
62b696b1c3 (architecture) Remove the separate executor service and merge it into the index service
The primary motivation for this is that in production, the large number of partitioned services has lead to an intermittent exhaustion of available database connections, as each service has a connection pool.

The decision to have a separate executor service dates back from when the index service was very slow to start, and the executor didn't always spin off its memory-hungry tasks into separate processes, which meant the executor would sometimes OOM and crash, and it was undesirable to bring the index down with it.
2025-07-23 12:57:13 +02:00
Viktor Lofgren
f1a900f383 (search) Clean up front page mobile design a bit 2025-07-23 12:20:40 +02:00
Viktor Lofgren
700364b86d (sample) Remove debug logging
The problem sat in the desk chair all along
2025-07-21 15:08:20 +02:00
Viktor Lofgren
7e725ddaed (sample) Remove debug logging
The problem sat in the desk chair all along
2025-07-21 14:41:59 +02:00
Viktor Lofgren
120209e138 (sample) Diagnosing compression errors 2025-07-21 14:34:08 +02:00
Viktor Lofgren
a771a5b6ce (sample) Test different approach to decoding 2025-07-21 14:19:01 +02:00
Viktor Lofgren
dac5b54128 (sample) Better logging for sample errors 2025-07-21 14:03:58 +02:00
Viktor Lofgren
6cfb143c15 (sample) Compress sample HTML data and introduce new API for only getting requests 2025-07-21 13:55:25 +02:00
Viktor Lofgren
23c818281b (converter) Reduce DomSample logging for NOT_FOUND 2025-07-21 13:37:55 +02:00
Viktor Lofgren
8aad253cf6 (converter) Add more logging around dom sample data retrieval errors 2025-07-21 13:26:38 +02:00
Viktor Lofgren
556d7af9dc Reapply "(grpc) Use grpc-netty instead of grpc-netty-shaded"
This reverts commit b7a5219ed3.
2025-07-21 13:23:32 +02:00
Viktor Lofgren
b7a5219ed3 Revert "(grpc) Use grpc-netty instead of grpc-netty-shaded"
Reverting this change to see if it's the cause of some instability issues observed.
2025-07-21 13:10:41 +02:00
Viktor Lofgren
a23ec521fe (converter) Ensure features is mutable on DetailsWithWords as this is assumed later 2025-07-21 12:50:04 +02:00
Viktor Lofgren
fff3babc6d (classier) Add rule for */pixel.gif as likely tracking pixels 2025-07-21 12:35:57 +02:00
41 changed files with 174 additions and 304 deletions

View File

@@ -7,7 +7,6 @@ public enum ServiceId {
Search("search-service"),
Index("index-service"),
Query("query-service"),
Executor("executor-service"),
Control("control-service"),

View File

@@ -189,7 +189,7 @@ public class ExecutorClient {
String uriPath = "/transfer/file/" + fileStorage.id();
String uriQuery = "path=" + URLEncoder.encode(path, StandardCharsets.UTF_8);
var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Executor, fileStorage.node()));
var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Index, fileStorage.node()));
if (endpoints.isEmpty()) {
throw new RuntimeException("No endpoints for node " + fileStorage.node());
}

View File

@@ -1,4 +1,4 @@
package nu.marginalia.executor;
package nu.marginalia.svc;
import com.google.inject.Inject;
import nu.marginalia.storage.FileStorageService;

View File

@@ -1,5 +1,5 @@
The execution subsystem is responsible for the execution of long running tasks on each
index node. It lives in the [executor-service](../services-core/executor-service) module.
index node. It lives in the [index-service](../services-core/index-service) module.
It accomplishes this using the [message queue and actor library](../libraries/message-queue/),
which permits program state to survive crashes and reboots.

View File

@@ -1,4 +1,4 @@
package nu.marginalia.executor;
package nu.marginalia.svc;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;

View File

@@ -41,7 +41,22 @@ public class DomSampleClient {
}
catch (StatusRuntimeException sre) {
if (sre.getStatus() != Status.NOT_FOUND) {
logger.error("Failed to fetch DOM sample");
logger.error("Failed to fetch DOM sample", sre);
}
return Optional.empty();
}
}
public Optional<RpcDomainSampleRequests> getSampleRequests(String domainName) {
try {
var val = channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getSampleRequests)
.run(RpcDomainName.newBuilder().setDomainName(domainName).build());
return Optional.of(val);
}
catch (StatusRuntimeException sre) {
if (sre.getStatus() != Status.NOT_FOUND) {
logger.error("Failed to fetch DOM sample", sre);
}
return Optional.empty();
}

View File

@@ -7,6 +7,7 @@ option java_multiple_files=true;
service DomSampleApi {
rpc getSample(RpcDomainName) returns (RpcDomainSample) {}
rpc getSampleRequests(RpcDomainName) returns (RpcDomainSampleRequests) {}
rpc hasSample(RpcDomainName) returns (RpcBooleanRsp) {}
rpc getAllSamples(RpcDomainName) returns (stream RpcDomainSample) {}
}
@@ -19,10 +20,16 @@ message RpcBooleanRsp {
bool answer = 1;
}
message RpcDomainSampleRequests {
string domainName = 1;
string url = 2;
repeated RpcOutgoingRequest outgoingRequests = 5;
}
message RpcDomainSample {
string domainName = 1;
string url = 2;
string htmlSample = 3;
bytes htmlSampleZstd = 3;
bool accepted_popover = 4;
repeated RpcOutgoingRequest outgoingRequests = 5;
}

View File

@@ -31,6 +31,7 @@ dependencies {
implementation libs.jsoup
implementation libs.opencsv
implementation libs.slop
implementation libs.zstd
implementation libs.sqlite
implementation libs.bundles.slf4j
implementation libs.commons.lang3

View File

@@ -1,6 +1,8 @@
package nu.marginalia.domsample;
import com.github.luben.zstd.Zstd;
import com.google.inject.Inject;
import com.google.protobuf.ByteString;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import nu.marginalia.api.domsample.*;
@@ -9,6 +11,7 @@ import nu.marginalia.service.server.DiscoverableService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.charset.StandardCharsets;
import java.util.List;
public class DomSampleGrpcService
@@ -42,7 +45,36 @@ public class DomSampleGrpcService
}
// Grab the first sample
RpcDomainSample.Builder response = convert(dbRecords.getFirst());
RpcDomainSample.Builder response = convertFullSample(dbRecords.getFirst());
responseObserver.onNext(response.build());
responseObserver.onCompleted();
}
catch (Exception e) {
logger.error("Error in getSample()", e);
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
}
}
@Override
public void getSampleRequests(RpcDomainName request, StreamObserver<RpcDomainSampleRequests> responseObserver) {
String domainName = request.getDomainName();
if (domainName.isBlank()) {
responseObserver.onError(Status.INVALID_ARGUMENT
.withDescription("Invalid domain name")
.asRuntimeException());
return;
}
try {
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
if (dbRecords.isEmpty()) {
responseObserver.onError(Status.NOT_FOUND.withDescription("No sample found").asRuntimeException());
return;
}
// Grab the first sample
RpcDomainSampleRequests.Builder response = convertRequestData(dbRecords.getFirst());
responseObserver.onNext(response.build());
responseObserver.onCompleted();
@@ -87,7 +119,7 @@ public class DomSampleGrpcService
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
for (var record : dbRecords) {
responseObserver.onNext(convert(record).build());
responseObserver.onNext(convertFullSample(record).build());
}
responseObserver.onCompleted();
@@ -98,12 +130,14 @@ public class DomSampleGrpcService
}
}
private RpcDomainSample.Builder convert(DomSampleDb.Sample dbSample) {
private RpcDomainSample.Builder convertFullSample(DomSampleDb.Sample dbSample) {
ByteString htmlZstd = ByteString.copyFrom(Zstd.compress(dbSample.sample().getBytes(StandardCharsets.UTF_8)));
var sampleBuilder = RpcDomainSample.newBuilder()
.setDomainName(dbSample.domain())
.setAcceptedPopover(dbSample.acceptedPopover())
.setHtmlSample(dbSample.sample());
.setHtmlSampleZstd(htmlZstd);
for (var req : dbSample.parseRequests()) {
sampleBuilder.addOutgoingRequestsBuilder()
@@ -120,4 +154,23 @@ public class DomSampleGrpcService
return sampleBuilder;
}
private RpcDomainSampleRequests.Builder convertRequestData(DomSampleDb.Sample dbSample) {
var sampleBuilder = RpcDomainSampleRequests.newBuilder()
.setDomainName(dbSample.domain());
for (var req : dbSample.parseRequests()) {
sampleBuilder.addOutgoingRequestsBuilder()
.setUrl(req.uri().toString())
.setMethod(switch (req.method().toUpperCase())
{
case "GET" -> RpcOutgoingRequest.RequestMethod.GET;
case "POST" -> RpcOutgoingRequest.RequestMethod.POST;
default -> RpcOutgoingRequest.RequestMethod.OTHER;
})
.setTimestamp(req.timestamp());
}
return sampleBuilder;
}
}

View File

@@ -87,7 +87,7 @@ class FeedFetcherServiceTest extends AbstractModule {
bind(DomainCoordinator.class).to(LocalDomainCoordinator.class);
bind(HikariDataSource.class).toInstance(dataSource);
bind(ServiceRegistryIf.class).toInstance(Mockito.mock(ServiceRegistryIf.class));
bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(ServiceId.Executor, 1, "", "", 0, UUID.randomUUID()));
bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(ServiceId.Index, 1, "", "", 0, UUID.randomUUID()));
bind(Integer.class).annotatedWith(Names.named("wmsa-system-node")).toInstance(1);
}

View File

@@ -90,6 +90,7 @@ dependencies {
implementation libs.commons.lang3
implementation libs.commons.compress
implementation libs.sqlite
implementation libs.bundles.grpc
implementation libs.bundles.httpcomponents

View File

@@ -22,6 +22,7 @@ dependencies {
implementation libs.bundles.slf4j
implementation libs.guava
implementation libs.zstd
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}

View File

@@ -1,5 +1,6 @@
package nu.marginalia.domclassifier;
import com.github.luben.zstd.ZstdInputStream;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.api.domsample.RpcDomainSample;
@@ -19,6 +20,7 @@ import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.function.Predicate;
import java.util.regex.Pattern;
@@ -84,8 +86,9 @@ public class DomSampleClassifier {
EdgeDomain sampleDomain = new EdgeDomain(sample.getDomainName());
try {
var parsedDoc = Jsoup.parse(sample.getHtmlSample());
try (var compressedStream = new ZstdInputStream(sample.getHtmlSampleZstd().newInput())) {
String html = new String(compressedStream.readAllBytes(), StandardCharsets.UTF_8);
var parsedDoc = Jsoup.parse(html);
var fixedElements = parsedDoc.select("*[data-position=fixed]");
if (sample.getAcceptedPopover()) {
@@ -104,7 +107,7 @@ public class DomSampleClassifier {
}
}
catch (Exception ex) {
logger.warn("Error when parsing DOM HTML sample");
logger.warn("Error when parsing DOM HTML sample", ex);
}
// Classify outgoing requests

View File

@@ -15,6 +15,7 @@
<classifier target="url-regex" rule="tracking">/ccm/collect$</classifier>
<classifier target="url-regex" rule="tracking">^/[0-9]+\.js$</classifier>
<classifier target="url-regex" rule="tracking">^/[a-z0-9]\.gif$</classifier>
<classifier target="url-regex" rule="tracking">^/pixel\.gif$</classifier>
<classifier target="url-regex" rule="ads">/pagead/</classifier>
<classifier target="url-regex" rule="ads">/google-ads/</classifier>

View File

@@ -12,4 +12,5 @@ class DDGTrackerDataTest {
data.getDomainInfo("hotjar.com").ifPresent(System.out::println);
data.getAllClassifications().forEach(System.out::println);
}
}

View File

@@ -1,6 +1,8 @@
package nu.marginalia.converting.processor;
import com.google.inject.Inject;
import io.grpc.Status;
import io.grpc.StatusRuntimeException;
import nu.marginalia.api.domsample.DomSampleClient;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.source.AnchorTagsSource;
@@ -98,10 +100,16 @@ public class DomainProcessor {
return domSampleClient
.getSampleAsync(domainName, domSampleExecutor)
.thenApply(domSampleClassifier::classifySample)
.handle((a,b) ->
Objects.requireNonNullElseGet(a,
() -> EnumSet.of(DomSampleClassification.UNCLASSIFIED)))
.get();
.handle((a,b) -> {
if (b != null) {
var cause = b.getCause();
if (!(cause instanceof StatusRuntimeException sre && sre.getStatus() != Status.NOT_FOUND)) {
logger.warn("Exception when fetching sample data", b);
}
return EnumSet.of(DomSampleClassification.UNCLASSIFIED);
}
return a;
}).get();
}
@Nullable

View File

@@ -161,7 +161,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, documentHeaders, dld);
if (!documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier())) {
features.add(HtmlFeature.SHORT_DOCUMENT);
}

View File

@@ -115,7 +115,9 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
ret.quality = -5;
ret.features = Set.of(HtmlFeature.PDF);
ret.features = new HashSet<>(); // must be mutable!
ret.features.add(HtmlFeature.PDF);
ret.description = getDescription(doc);
ret.hashCode = dld.localitySensitiveHashCode();

View File

@@ -13,13 +13,13 @@ A map of the most important components and how they relate can be found below.
![image](../doc/diagram/conceptual-overview.svg)
The core part of the search engine is the index service, which is responsible for storing and retrieving
the document data. The index serive is partitioned, along with the executor service, which is responsible for executing
processes. At least one instance of each service must be run, but more can be run
the document data. The index service is partitioned and is responsible for both index lookups and spawning
per-partition processing tasks. At least one instance of each service must be run, but more can be run
alongside. Multiple partitions is desirable in production to distribute load across multiple physical drives,
as well as reducing the impact of downtime.
Search queries are delegated via the query service, which is a proxy that fans out the query to all
eligible index services. The control service is responsible for distributing commands to the executor
eligible index services. The control service is responsible for distributing commands to the partitions
service, and for monitoring the health of the system. It also offers a web interface for operating the system.
### Services
@@ -32,7 +32,6 @@ service, and for monitoring the health of the system. It also offers a web inte
* [index](services-core/index-service)
* Exposes the [index](index) subsystem
* Exposes the [functions/link-graph](functions/link-graph) subsystem
* [executor](services-core/executor-service)
* Exposes the [execution](execution) subsystem
* [assistant](services-core/assistant-service)
* Exposes the [functions/math](functions/math) subsystem
@@ -57,7 +56,7 @@ Services that expose HTTP endpoints tend to have more code. They are marked wit
### Processes
Processes are batch jobs that deal with data retrieval, processing and loading. These are spawned and orchestrated by
the executor service, which is controlled by the control service.
the index service, which is controlled by the control service.
* [processes](processes/)
* [crawling-process](processes/crawling-process)

View File

@@ -11,7 +11,7 @@ import nu.marginalia.api.domains.DomainInfoClient;
import nu.marginalia.api.domains.model.DomainInformation;
import nu.marginalia.api.domains.model.SimilarDomain;
import nu.marginalia.api.domsample.DomSampleClient;
import nu.marginalia.api.domsample.RpcDomainSample;
import nu.marginalia.api.domsample.RpcDomainSampleRequests;
import nu.marginalia.api.domsample.RpcOutgoingRequest;
import nu.marginalia.api.feeds.FeedsClient;
import nu.marginalia.api.feeds.RpcFeed;
@@ -399,7 +399,7 @@ public class SearchSiteInfoService {
return forServiceUnavailable(domainName);
}
Optional<RpcDomainSample> sample = domSampleClient.getSample(domainName.toLowerCase());
Optional<RpcDomainSampleRequests> sample = domSampleClient.getSampleRequests(domainName.toLowerCase());
if (sample.isEmpty()) {
return forNoData(domainName);
}

View File

@@ -38,8 +38,8 @@
<a href="https://old-search.marginalia.nu/" class="underline text-liteblue dark:text-blue-200">here</a>.
</div>
</div>
<div class="mx-auto flex flex-col sm:flex-row my-4 sm:space-x-2 space-y-2 sm:space-y-0 w-full md:w-auto px-2 items-center sm:items-stretch">
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-8 sm:p-4 space-y-3 w-96 sm:w-64">
<div class="mx-auto px-8 flex flex-col sm:flex-row my-4 sm:space-x-2 space-y-2 sm:space-y-0 w-full md:w-auto items-center sm:items-stretch">
<div class="flex flex-col items-center border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-8 sm:p-4 space-y-3 w-[300px] sm:w-64">
<div><i class="fas fa-sailboat mx-2 text-margeblue dark:text-slate-200"></i>Explore the Web</div>
<ul class="list-disc ml-8 sm:ml-6 text-slate-700 dark:text-white text-xs leading-5">
<li>Prioritizes non-commercial content</li>
@@ -48,14 +48,14 @@
</ul>
</div>
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-8 sm:p-4 space-y-3 w-96 sm:w-64">
<div class="flex flex-col items-center border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-8 sm:p-4 space-y-3 w-[300px] sm:w-64">
<div><i class="fas fa-hand-holding-hand mx-2 text-margeblue dark:text-slate-200"></i>Open Source</div>
<ul class="list-disc ml-8 sm:ml-6 text-slate-700 dark:text-white text-xs leading-5">
<li>Custom index and crawler software</li>
<li>Simple technology, no AI</li>
<li>AGPL license</li>
</ul>
<div class="flex pt-4 gap-2">
<div class="flex pt-4 gap-2 flex-col md:flex-row">
<div class="text-xs text-liteblue dark:text-blue-200">
<i class="fa-brands fa-github"></i>
<a href="https://git.marginalia.nu/" class="underline">Git Repository</a>
@@ -67,7 +67,7 @@
</div>
</div>
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-8 sm:p-4 space-y-3 w-96 sm:w-64">
<div class="flex flex-col items-center border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-8 sm:p-4 space-y-3 w-[300px] sm:w-64">
<div><i class="fas fa-lock mx-2 text-margeblue dark:text-slate-200"></i> Privacy by default</div>
<ul class="list-disc ml-8 sm:ml-6 text-slate-700 dark:text-white text-xs leading-5">
<li>Filter out tracking </li>

View File

@@ -2,7 +2,7 @@ package nu.marginalia.control.node.model;
import nu.marginalia.nodecfg.model.NodeConfiguration;
public record IndexNodeStatus(NodeConfiguration configuration, boolean indexServiceOnline, boolean executorServiceOnline) {
public record IndexNodeStatus(NodeConfiguration configuration, boolean indexServiceOnline) {
public int id() {
return configuration.node();
}

View File

@@ -338,7 +338,7 @@ public class ControlNodeService {
}
private List<EventLogEntry> getEvents(int nodeId) {
List<String> services = List.of(ServiceId.Index.serviceName +":"+nodeId, ServiceId.Executor.serviceName +":"+nodeId);
List<String> services = List.of(ServiceId.Index.serviceName +":"+nodeId);
List<EventLogEntry> events = new ArrayList<>(20);
for (var service :services) {
events.addAll(eventLogService.getLastEntriesForService(service, Long.MAX_VALUE, 10));
@@ -358,8 +358,7 @@ public class ControlNodeService {
public IndexNodeStatus getStatus(NodeConfiguration config) {
return new IndexNodeStatus(config,
monitors.isServiceUp(ServiceId.Index, config.node()),
monitors.isServiceUp(ServiceId.Executor, config.node())
monitors.isServiceUp(ServiceId.Index, config.node())
);
}

View File

@@ -2,7 +2,7 @@
<h2>Nodes</h2>
<table class="table">
<tr>
<th>Node</th><th>Profile</th><th>Queries</th><th>Enabled</th><th>Index</th><th>Executor</th>
<th>Node</th><th>Profile</th><th>Queries</th><th>Enabled</th><th>Index</th>
</tr>
{{#each .}}
<tr>
@@ -24,9 +24,6 @@
</td>
{{#if indexServiceOnline}}<td>Online</td>{{/if}}
{{#unless indexServiceOnline}}<td class="table-danger">Offline</td>{{/unless}}
{{#if executorServiceOnline}}<td>Online</td>{{/if}}
{{#unless executorServiceOnline}}<td class="table-warning">Offline</td>{{/unless}}
</tr>
{{/each}}
</table>

View File

@@ -1,100 +0,0 @@
plugins {
id 'java'
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.5'
}
application {
mainClass = 'nu.marginalia.executor.ExecutorMain'
applicationName = 'executor-service'
}
tasks.distZip.enabled = false
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
apply from: "$rootProject.projectDir/srcsets.gradle"
apply from: "$rootProject.projectDir/docker.gradle"
dependencies {
// These look weird but they're needed to be able to spawn the processes
// from the executor service
implementation project(':code:processes:crawling-process')
implementation project(':code:processes:loading-process')
implementation project(':code:processes:converting-process')
implementation project(':code:processes:index-constructor-process')
implementation project(':code:common:config')
implementation project(':code:common:model')
implementation project(':code:common:db')
implementation project(':code:common:linkdb')
implementation project(':code:common:service')
implementation project(':third-party:commons-codec')
implementation project(':code:libraries:message-queue')
implementation project(':code:functions:link-graph:api')
implementation project(':code:functions:favicon')
implementation project(':code:functions:favicon:api')
implementation project(':code:functions:nsfw-domain-filter')
implementation project(':code:processes:crawling-process:model')
implementation project(':code:processes:crawling-process:model')
implementation project(':code:processes:crawling-process:ft-link-parser')
implementation project(':code:index:index-journal')
implementation project(':code:index:api')
implementation project(':code:processes:process-mq-api')
implementation project(':code:execution')
implementation project(':code:execution:api')
implementation project(':third-party:encyclopedia-marginalia-nu')
implementation libs.bundles.slf4j
implementation dependencies.create(libs.spark.get()) {
exclude group: 'org.eclipse.jetty'
}
implementation libs.bundles.jetty
implementation libs.guava
libs.bundles.grpc.get().each {
implementation dependencies.create(it) {
exclude group: 'com.google.guava'
}
}
implementation libs.gson
implementation libs.prometheus
implementation libs.notnull
implementation libs.guava
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}
implementation libs.trove
implementation libs.zstd
implementation libs.jsoup
implementation libs.commons.io
implementation libs.commons.compress
implementation libs.commons.lang3
implementation libs.bundles.mariadb
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
testImplementation libs.commons.codec
testImplementation 'org.testcontainers:mariadb:1.17.4'
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
testImplementation project(':code:libraries:test-helpers')
}

View File

@@ -1,45 +0,0 @@
package nu.marginalia.executor;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.nsfw.NsfwFilterModule;
import nu.marginalia.service.MainClass;
import nu.marginalia.service.ServiceId;
import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.service.module.ServiceConfigurationModule;
import nu.marginalia.service.module.ServiceDiscoveryModule;
import nu.marginalia.service.server.Initialization;
import nu.marginalia.service.server.NodeStatusWatcher;
public class ExecutorMain extends MainClass {
private final ExecutorSvc service;
@Inject
public ExecutorMain(ExecutorSvc service) {
this.service = service;
}
public static void main(String... args) {
init(ServiceId.Executor, args);
Injector injector = Guice.createInjector(
new ExecutorModule(),
new DatabaseModule(false),
new NsfwFilterModule(),
new ServiceDiscoveryModule(),
new ServiceConfigurationModule(ServiceId.Executor)
);
// Orchestrate the boot order for the services
var registry = injector.getInstance(ServiceRegistryIf.class);
var configuration = injector.getInstance(ServiceConfiguration.class);
orchestrateBoot(registry, configuration);
injector.getInstance(NodeStatusWatcher.class);
injector.getInstance(ExecutorMain.class);
injector.getInstance(Initialization.class).setReady();
}
}

View File

@@ -1,8 +0,0 @@
package nu.marginalia.executor;
import com.google.inject.AbstractModule;
public class ExecutorModule extends AbstractModule {
public void configure() {
}
}

View File

@@ -1,55 +0,0 @@
package nu.marginalia.executor;
import com.google.inject.Inject;
import nu.marginalia.execution.*;
import nu.marginalia.functions.favicon.FaviconGrpcService;
import nu.marginalia.service.discovery.property.ServicePartition;
import nu.marginalia.service.server.BaseServiceParams;
import nu.marginalia.service.server.SparkService;
import nu.marginalia.service.server.mq.MqRequest;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Spark;
import java.util.List;
// Weird name for this one to not have clashes with java.util.concurrent.ExecutorService
public class ExecutorSvc extends SparkService {
private static final Logger logger = LoggerFactory.getLogger(ExecutorSvc.class);
private final ExecutionInit executionInit;
@Inject
public ExecutorSvc(BaseServiceParams params,
ExecutorGrpcService executorGrpcService,
ExecutorCrawlGrpcService executorCrawlGrpcService,
ExecutorSideloadGrpcService executorSideloadGrpcService,
ExecutorExportGrpcService executorExportGrpcService,
FaviconGrpcService faviconGrpcService,
ExecutionInit executionInit,
ExecutorFileTransferService fileTransferService) throws Exception {
super(params,
ServicePartition.partition(params.configuration.node()),
List.of(executorGrpcService,
executorCrawlGrpcService,
executorSideloadGrpcService,
executorExportGrpcService,
faviconGrpcService)
);
this.executionInit = executionInit;
Spark.get("/transfer/file/:fid", fileTransferService::transferFile);
Spark.head("/transfer/file/:fid", fileTransferService::transferFile);
}
@MqRequest(endpoint="FIRST-BOOT")
public void setUpDefaultActors(String message) throws Exception {
logger.info("Initializing default actors");
executionInit.initDefaultActors();
}
}

View File

@@ -1,10 +0,0 @@
The executor service is a partitioned service responsible for executing and keeping
track of long-running maintenance and operational tasks, such as crawling or data
processing.
The executor service is closely linked to the [control-service](../control-service),
which provides a user interface for much of the executor's functionality.
The service it itself relatively bare of code, but imports and exposes the [execution subsystem](../../execution),
which is responsible for the actual execution of tasks.

View File

@@ -30,10 +30,16 @@ dependencies {
implementation project(':code:common:db')
implementation project(':code:common:linkdb')
implementation project(':code:execution')
implementation project(':code:execution:api')
implementation project(':code:functions:favicon')
implementation project(':code:functions:favicon:api')
implementation project(':code:index')
implementation project(':code:functions:link-graph:partition')
implementation project(':code:functions:link-graph:api')
implementation project(':code:functions:search-query:api')
implementation project(':code:functions:nsfw-domain-filter')
implementation project(':code:index:api')
testImplementation project(path: ':code:services-core:control-service')

View File

@@ -3,13 +3,14 @@ package nu.marginalia.index;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.nsfw.NsfwFilterModule;
import nu.marginalia.service.MainClass;
import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.service.module.ServiceDiscoveryModule;
import nu.marginalia.service.ServiceId;
import nu.marginalia.service.module.ServiceConfigurationModule;
import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.service.module.ServiceConfigurationModule;
import nu.marginalia.service.module.ServiceDiscoveryModule;
import nu.marginalia.service.server.Initialization;
import nu.marginalia.service.server.NodeStatusWatcher;
@@ -28,6 +29,7 @@ public class IndexMain extends MainClass {
new IndexModule(),
new DatabaseModule(false),
new ServiceDiscoveryModule(),
new NsfwFilterModule(),
new ServiceConfigurationModule(ServiceId.Index)
);

View File

@@ -2,6 +2,8 @@ package nu.marginalia.index;
import com.google.inject.Inject;
import nu.marginalia.IndexLocations;
import nu.marginalia.execution.*;
import nu.marginalia.functions.favicon.FaviconGrpcService;
import nu.marginalia.index.api.IndexMqEndpoints;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.linkdb.docs.DocumentDbReader;
@@ -14,9 +16,11 @@ import nu.marginalia.service.server.Initialization;
import nu.marginalia.service.server.SparkService;
import nu.marginalia.service.server.mq.MqRequest;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.svc.ExecutorFileTransferService;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Spark;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -38,6 +42,7 @@ public class IndexService extends SparkService {
private final DomainLinks domainLinks;
private final ServiceEventLog eventLog;
private final ExecutionInit executionInit;
@Inject
public IndexService(BaseServiceParams params,
@@ -48,13 +53,25 @@ public class IndexService extends SparkService {
DocumentDbReader documentDbReader,
DomainLinks domainLinks,
PartitionLinkGraphService partitionLinkGraphService,
ExecutorGrpcService executorGrpcService,
ExecutorCrawlGrpcService executorCrawlGrpcService,
ExecutorSideloadGrpcService executorSideloadGrpcService,
ExecutorExportGrpcService executorExportGrpcService,
FaviconGrpcService faviconGrpcService,
ExecutionInit executionInit,
ExecutorFileTransferService fileTransferService,
ServiceEventLog eventLog)
throws Exception
{
super(params,
ServicePartition.partition(params.configuration.node()),
List.of(indexQueryService,
partitionLinkGraphService)
partitionLinkGraphService,
executorGrpcService,
executorCrawlGrpcService,
executorSideloadGrpcService,
executorExportGrpcService,
faviconGrpcService)
);
this.opsService = opsService;
@@ -62,15 +79,26 @@ public class IndexService extends SparkService {
this.fileStorageService = fileStorageService;
this.documentDbReader = documentDbReader;
this.domainLinks = domainLinks;
this.executionInit = executionInit;
this.eventLog = eventLog;
this.init = params.initialization;
Spark.get("/transfer/file/:fid", fileTransferService::transferFile);
Spark.head("/transfer/file/:fid", fileTransferService::transferFile);
Thread.ofPlatform().name("initialize-index").start(this::initialize);
}
volatile boolean initialized = false;
@MqRequest(endpoint="FIRST-BOOT")
public void setUpDefaultActors(String message) throws Exception {
logger.info("Initializing default actors");
executionInit.initDefaultActors();
}
@MqRequest(endpoint = IndexMqEndpoints.INDEX_RERANK)
public String rerank(String message) {
if (!opsService.rerank()) {

View File

@@ -24,7 +24,6 @@ dependencies {
implementation project(':code:services-core:query-service')
implementation project(':code:services-core:index-service')
implementation project(':code:services-core:control-service')
implementation project(':code:services-core:executor-service')
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit

View File

@@ -12,3 +12,5 @@
2025-05-28: Deploy assistant and browserless.
2025-06-06: Deploy assistant and browserless.
2025-07-21: Deploy executor partition 1.
2025-07-21: Deploy search.
2025-07-23: Redeploy all.

File diff suppressed because one or more lines are too long

Before

Width:  |  Height:  |  Size: 60 KiB

After

Width:  |  Height:  |  Size: 60 KiB

View File

@@ -40,11 +40,6 @@ services:
<<: *partition-1
image: "marginalia/index-service"
container_name: "index-service-1"
executor-service-1:
<<: *partition-1
restart: always
image: "marginalia/executor-service"
container_name: "executor-service-1"
query-service:
<<: *service
image: "marginalia/query-service"

View File

@@ -62,20 +62,10 @@ services:
<<: *partition-1
image: "marginalia/index-service"
container_name: "index-service-1"
executor-service-1:
<<: *partition-1
restart: always
image: "marginalia/executor-service"
container_name: "executor-service-1"
index-service-2:
<<: *partition-2
image: "marginalia/index-service"
container_name: "index-service-2"
executor-service-2:
<<: *partition-2
restart: always
image: "marginalia/executor-service"
container_name: "executor-service-2"
query-service:
<<: *service
image: "marginalia/query-service"

View File

@@ -62,20 +62,10 @@ services:
<<: *partition-1
image: "marginalia/index-service"
container_name: "index-service-1"
executor-service-1:
<<: *partition-1
restart: always
image: "marginalia/executor-service"
container_name: "executor-service-1"
index-service-2:
<<: *partition-2
image: "marginalia/index-service"
container_name: "index-service-2"
executor-service-2:
<<: *partition-2
restart: always
image: "marginalia/executor-service"
container_name: "executor-service-2"
query-service:
<<: *service
image: "marginalia/query-service"

View File

@@ -31,13 +31,11 @@ A working setup needs at all the services
* control [ http port is the control GUI ]
* query [ http port is the query GUI ]
* index [ http port is internal ]
* executor [ http port is internal ]
Since you will need to manage ports yourself, you must assign distinct ports-pairs to each service.
* An index and executor services should exist on the same partition e.g. index:1 and executor:1. The partition
number is the last digit of the service name, and should be positive. You can have multiple pairs of index
and executor partitions, but the pair should run on the same physical machine with the same install directory.
* An index service should exist on the same partition e.g. index:1. The partition
number is the last digit of the service name, and should be positive.
* The query service can use any partition number.

View File

@@ -4,7 +4,6 @@ include 'code:services-core:index-service'
include 'code:services-core:assistant-service'
include 'code:services-core:control-service'
include 'code:services-core:query-service'
include 'code:services-core:executor-service'
include 'code:services-core:single-service-runner'
include 'code:services-application:search-service'

View File

@@ -76,13 +76,6 @@ SERVICE_CONFIG = {
deploy_tier=3,
groups={"all", "index"}
),
'executor': ServiceConfig(
gradle_target=':code:services-core:executor-service:docker',
docker_name='executor-service',
instances=10,
deploy_tier=3,
groups={"all", "executor"}
),
'control': ServiceConfig(
gradle_target=':code:services-core:control-service:docker',
docker_name='control-service',