1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

10 Commits

Author SHA1 Message Date
Viktor
24022c5adc Merge pull request #203 from MarginaliaSearch/nsfw-domain-lists
Nsfw blocking via UT1 domain lists
2025-06-07 13:24:05 +02:00
Viktor Lofgren
1de9ecc0b6 (nsfw) Add metrics to the filtering so we can monitor it 2025-06-07 13:17:05 +02:00
Viktor Lofgren
9b80245ea0 (nsfw) Move filtering to the IndexApiClient, and add filtering options to the internal APIs and public API. 2025-06-07 12:54:20 +02:00
Viktor Lofgren
4e1595c1a6 (nsfw) Initial work on adding UT1-based domain filtering 2025-06-06 14:23:37 +02:00
Viktor Lofgren
0be8585fa5 Add tag format hint to deploy script 2025-06-06 10:03:18 +02:00
Viktor Lofgren
a0fe070fe7 Redeploy browserless and assistant. 2025-06-06 09:51:39 +02:00
Viktor Lofgren
abe9da0fc6 (search) Ensure the new search UI sets the correct content-type for opensearch.xml 2025-05-29 12:44:55 +02:00
Viktor Lofgren
56d0128b0a (dom-sample) Remove redundant code 2025-05-28 17:43:46 +02:00
Viktor Lofgren
840b68ac55 (dom-sample) Minor cleanups 2025-05-28 16:27:27 +02:00
Viktor Lofgren
c34ff6d6c3 (dom-sample) Use WAL journal for dom sample db 2025-05-28 16:16:28 +02:00
40 changed files with 619 additions and 100 deletions

View File

@@ -0,0 +1,5 @@
-- Domains to be excluded from search results by the NSFW filter.
-- ID is the domain id: rows are populated by NsfwDomainFilter.fetchLists()
-- via INSERT ... SELECT ID FROM EC_DOMAIN, so the AUTO_INCREMENT is
-- effectively unused -- TODO confirm.
-- TIER encodes the blocklist category: 1 = danger (cryptojacking/malware/
-- phishing), 2 = smut (adult/gambling); see NsfwDomainFilter.NSFW_BLOCK_*.
CREATE TABLE IF NOT EXISTS WMSA_prod.NSFW_DOMAINS (
ID INT NOT NULL AUTO_INCREMENT,
TIER INT NOT NULL,
PRIMARY KEY (ID)
);

View File

@@ -37,6 +37,7 @@ dependencies {
implementation project(':code:functions:link-graph:api') implementation project(':code:functions:link-graph:api')
implementation project(':code:functions:live-capture:api') implementation project(':code:functions:live-capture:api')
implementation project(':code:functions:search-query') implementation project(':code:functions:search-query')
implementation project(':code:functions:nsfw-domain-filter')
implementation project(':code:execution:api') implementation project(':code:execution:api')
implementation project(':code:processes:crawling-process:model') implementation project(':code:processes:crawling-process:model')

View File

@@ -6,6 +6,7 @@ import java.util.Set;
public enum ExecutorActor { public enum ExecutorActor {
PREC_EXPORT_ALL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED), PREC_EXPORT_ALL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
SYNC_NSFW_LISTS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
CRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED), CRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
RECRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED), RECRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
@@ -35,7 +36,8 @@ public enum ExecutorActor {
LIVE_CRAWL(NodeProfile.REALTIME), LIVE_CRAWL(NodeProfile.REALTIME),
PROC_LIVE_CRAWL_SPAWNER(NodeProfile.REALTIME), PROC_LIVE_CRAWL_SPAWNER(NodeProfile.REALTIME),
SCRAPE_FEEDS(NodeProfile.REALTIME), SCRAPE_FEEDS(NodeProfile.REALTIME),
UPDATE_RSS(NodeProfile.REALTIME); UPDATE_RSS(NodeProfile.REALTIME)
;
public String id() { public String id() {
return "fsm:" + name().toLowerCase(); return "fsm:" + name().toLowerCase();

View File

@@ -68,6 +68,7 @@ public class ExecutorActorControlService {
ExecutorActorStateMachines stateMachines, ExecutorActorStateMachines stateMachines,
MigrateCrawlDataActor migrateCrawlDataActor, MigrateCrawlDataActor migrateCrawlDataActor,
ExportAllPrecessionActor exportAllPrecessionActor, ExportAllPrecessionActor exportAllPrecessionActor,
UpdateNsfwFiltersActor updateNsfwFiltersActor,
UpdateRssActor updateRssActor) throws SQLException { UpdateRssActor updateRssActor) throws SQLException {
this.messageQueueFactory = messageQueueFactory; this.messageQueueFactory = messageQueueFactory;
this.eventLog = baseServiceParams.eventLog; this.eventLog = baseServiceParams.eventLog;
@@ -109,6 +110,7 @@ public class ExecutorActorControlService {
register(ExecutorActor.UPDATE_RSS, updateRssActor); register(ExecutorActor.UPDATE_RSS, updateRssActor);
register(ExecutorActor.MIGRATE_CRAWL_DATA, migrateCrawlDataActor); register(ExecutorActor.MIGRATE_CRAWL_DATA, migrateCrawlDataActor);
register(ExecutorActor.SYNC_NSFW_LISTS, updateNsfwFiltersActor);
if (serviceConfiguration.node() == 1) { if (serviceConfiguration.node() == 1) {
register(ExecutorActor.PREC_EXPORT_ALL, exportAllPrecessionActor); register(ExecutorActor.PREC_EXPORT_ALL, exportAllPrecessionActor);

View File

@@ -0,0 +1,53 @@
package nu.marginalia.actor.task;
import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.nsfw.NsfwDomainFilter;
import nu.marginalia.service.module.ServiceConfiguration;
@Singleton
public class UpdateNsfwFiltersActor extends RecordActorPrototype {
    private final ServiceConfiguration serviceConfiguration;
    private final NsfwDomainFilter nsfwDomainFilter;

    /** Entry state: checks that we are running on the coordinating node. */
    public record Initial() implements ActorStep {}
    /** Work state: triggers the blocklist download and table refresh. */
    public record Run() implements ActorStep {}

    @Inject
    public UpdateNsfwFiltersActor(Gson gson,
                                  ServiceConfiguration serviceConfiguration,
                                  NsfwDomainFilter nsfwDomainFilter)
    {
        super(gson);
        this.serviceConfiguration = serviceConfiguration;
        this.nsfwDomainFilter = nsfwDomainFilter;
    }

    @Override
    public ActorStep transition(ActorStep self) throws Exception {
        return switch (self) {
            // The filter data lives in the shared database, so the refresh
            // must only ever run on node 1.
            case Initial() -> serviceConfiguration.node() == 1
                    ? new Run()
                    : new Error("This actor can only run on node 1");
            case Run() -> {
                nsfwDomainFilter.fetchLists();
                yield new End();
            }
            default -> new Error();
        };
    }

    @Override
    public String describe() {
        return "Sync NSFW filters";
    }
}

View File

@@ -12,6 +12,7 @@ import org.slf4j.LoggerFactory;
import java.net.URI; import java.net.URI;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.time.Duration;
import java.util.HashSet; import java.util.HashSet;
import java.util.Set; import java.util.Set;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
@@ -109,8 +110,7 @@ public class DomSampleService {
private void updateDomain(BrowserlessClient client, String domain) { private void updateDomain(BrowserlessClient client, String domain) {
var rootUrl = "https://" + domain + "/"; var rootUrl = "https://" + domain + "/";
try { try {
var content = client.annotatedContent(rootUrl, var content = client.annotatedContent(rootUrl, new BrowserlessClient.GotoOptions("load", Duration.ofSeconds(10).toMillis()));
BrowserlessClient.GotoOptions.defaultValues());
if (content.isPresent()) { if (content.isPresent()) {
db.saveSample(domain, rootUrl, content.get()); db.saveSample(domain, rootUrl, content.get());

View File

@@ -26,7 +26,9 @@ public class DomSampleDb implements AutoCloseable {
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS samples (url TEXT PRIMARY KEY, domain TEXT, sample BLOB, requests BLOB, accepted_popover BOOLEAN DEFAULT FALSE)"); stmt.executeUpdate("CREATE TABLE IF NOT EXISTS samples (url TEXT PRIMARY KEY, domain TEXT, sample BLOB, requests BLOB, accepted_popover BOOLEAN DEFAULT FALSE)");
stmt.executeUpdate("CREATE INDEX IF NOT EXISTS domain_index ON samples (domain)"); stmt.executeUpdate("CREATE INDEX IF NOT EXISTS domain_index ON samples (domain)");
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS schedule (domain TEXT PRIMARY KEY, last_fetch TIMESTAMP DEFAULT NULL)"); stmt.executeUpdate("CREATE TABLE IF NOT EXISTS schedule (domain TEXT PRIMARY KEY, last_fetch TIMESTAMP DEFAULT NULL)");
stmt.execute("PRAGMA journal_mode=WAL");
} }
} }
public void syncDomains(Set<String> domains) { public void syncDomains(Set<String> domains) {
@@ -151,8 +153,6 @@ public class DomSampleDb implements AutoCloseable {
} }
record Request(String url, String method, String timestamp, boolean acceptedPopover) {}
public void saveSampleRaw(String domain, String url, String sample, String requests, boolean acceptedPopover) throws SQLException { public void saveSampleRaw(String domain, String url, String sample, String requests, boolean acceptedPopover) throws SQLException {
try (var stmt = connection.prepareStatement(""" try (var stmt = connection.prepareStatement("""
INSERT OR REPLACE INSERT OR REPLACE

View File

@@ -141,7 +141,7 @@ public class BrowserlessClient implements AutoCloseable {
public record GotoOptions(String waitUntil, long timeout) { public record GotoOptions(String waitUntil, long timeout) {
public static GotoOptions defaultValues() { public static GotoOptions defaultValues() {
return new GotoOptions("load", Duration.ofSeconds(10).toMillis()); return new GotoOptions("networkidle2", Duration.ofSeconds(10).toMillis());
} }
} }

View File

@@ -126,7 +126,6 @@ public class LiveCaptureGrpcService
} }
else { else {
EdgeDomain domain = domainNameOpt.get(); EdgeDomain domain = domainNameOpt.get();
String domainNameStr = domain.toString();
if (!isValidDomainForCapture(domain)) { if (!isValidDomainForCapture(domain)) {
ScreenshotDbOperations.flagDomainAsFetched(conn, domain); ScreenshotDbOperations.flagDomainAsFetched(conn, domain);

View File

@@ -108,7 +108,7 @@ public class BrowserlessClientTest {
DomSampleDb dbop = new DomSampleDb(Path.of("/tmp/dom-sample.db")) DomSampleDb dbop = new DomSampleDb(Path.of("/tmp/dom-sample.db"))
) { ) {
var content = client.annotatedContent("https://marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()).orElseThrow(); var content = client.annotatedContent("https://marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()).orElseThrow();
dbop.saveSample("marginalia.nu", "https://www.thesodacanstove.com/alcohol-stove/how-to-build/", content); dbop.saveSample("marginalia.nu", "https://marginalia.nu/", content);
System.out.println(content); System.out.println(content);
Assertions.assertFalse(content.isBlank(), "Content should not be empty"); Assertions.assertFalse(content.isBlank(), "Content should not be empty");

View File

@@ -0,0 +1,43 @@
plugins {
    id 'java'
    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
    }
}

apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {
    implementation project(':code:common:config')
    implementation project(':code:common:model')
    implementation project(':code:common:db')

    implementation libs.bundles.slf4j
    implementation libs.prometheus
    implementation libs.guava
    implementation libs.commons.lang3
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
    implementation libs.notnull
    implementation libs.fastutil
    implementation libs.bundles.mariadb

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito

    // The BOM platform pins all Testcontainers module versions; declaring the
    // modules version-less below prevents the versions from drifting apart.
    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
    testImplementation 'org.testcontainers:mariadb'
    testImplementation 'org.testcontainers:junit-jupiter'

    testImplementation libs.commons.codec
    testImplementation project(':code:common:service')
    testImplementation project(':code:libraries:test-helpers')
}

View File

@@ -0,0 +1,192 @@
package nu.marginalia.nsfw;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import com.zaxxer.hikari.HikariDataSource;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.InputStreamReader;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.zip.GZIPInputStream;
@Singleton
public class NsfwDomainFilter {
    private final HikariDataSource dataSource;

    /** URLs of UT1 blocklists mapped to the "danger" tier (tier 1). */
    private final List<String> dangerLists;
    /** URLs of UT1 blocklists mapped to the "smut" tier (tier 2). */
    private final List<String> smutLists;

    // Replaced wholesale by sync(); volatile so concurrent readers always see
    // a fully-populated set (either the old one or the new one, never partial).
    private volatile IntOpenHashSet blockedDomainIdsTier1 = new IntOpenHashSet();
    private volatile IntOpenHashSet blockedDomainIdsTier2 = new IntOpenHashSet();

    private static final Logger logger = LoggerFactory.getLogger(NsfwDomainFilter.class);

    public static final int NSFW_DISABLE = 0;
    public static final int NSFW_BLOCK_DANGER = 1;
    public static final int NSFW_BLOCK_SMUT = 2;

    @Inject
    public NsfwDomainFilter(HikariDataSource dataSource,
                            @Named("nsfw.dangerLists") List<String> dangerLists,
                            @Named("nsfw.smutLists") List<String> smutLists
    ) {
        this.dataSource = dataSource;
        this.dangerLists = dangerLists;
        this.smutLists = smutLists;

        // Background refresh: re-read NSFW_DOMAINS hourly so updates written
        // by another node (via fetchLists) propagate to this instance.
        Thread.ofPlatform().daemon().name("NsfwDomainFilterSync").start(() -> {
            while (true) {
                sync();
                try {
                    TimeUnit.HOURS.sleep(1);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    break; // Exit the loop if interrupted
                }
            }
        });
    }

    /**
     * Returns true if the domain should be hidden at the requested filter tier.
     *
     * @param domainId   domain id (as used in EC_DOMAIN / NSFW_DOMAINS)
     * @param tier       0 = no filtering, 1 = danger lists only, 2 = danger + smut
     */
    public boolean isBlocked(int domainId, int tier) {
        if (tier == 0)
            return false;

        if (tier >= 1 && blockedDomainIdsTier1.contains(domainId))
            return true;
        if (tier >= 2 && blockedDomainIdsTier2.contains(domainId))
            return true;

        return false;
    }

    /** Reloads the in-memory id sets from the NSFW_DOMAINS table. */
    private synchronized void sync() {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("SELECT ID, TIER FROM NSFW_DOMAINS")
        ) {
            var rs = stmt.executeQuery();

            IntOpenHashSet tier1 = new IntOpenHashSet();
            IntOpenHashSet tier2 = new IntOpenHashSet();

            while (rs.next()) {
                int domainId = rs.getInt("ID");
                int tier = rs.getInt("TIER");

                switch (tier) {
                    case 1 -> tier1.add(domainId);
                    case 2 -> tier2.add(domainId);
                }
            }

            // Publish both sets at once; readers may briefly see a new tier1
            // with an old tier2, which only affects a single query's results.
            this.blockedDomainIdsTier1 = tier1;
            this.blockedDomainIdsTier2 = tier2;

            logger.info("NSFW domain filter synced: {} tier 1, {} tier 2", tier1.size(), tier2.size());
        }
        catch (SQLException ex) {
            logger.error("Failed to sync NSFW domain filter", ex);
        }
    }

    /**
     * Downloads all configured blocklists and rebuilds the NSFW_DOMAINS table.
     * <p>
     * The new contents are staged in NSFW_DOMAINS_TMP and swapped in via
     * RENAME TABLE so concurrent readers never see a half-populated table.
     * Only domains present in EC_DOMAIN are recorded.
     */
    public synchronized void fetchLists() {
        try (var conn = dataSource.getConnection();
             HttpClient client = HttpClient.newBuilder()
                     .followRedirects(HttpClient.Redirect.ALWAYS)
                     .build();
             var stmt = conn.createStatement();
             var insertStmt = conn.prepareStatement("INSERT INTO NSFW_DOMAINS_TMP (ID, TIER) SELECT ID, ? FROM EC_DOMAIN WHERE DOMAIN_NAME = ?")) {

            stmt.execute("DROP TABLE IF EXISTS NSFW_DOMAINS_TMP");
            stmt.execute("CREATE TABLE NSFW_DOMAINS_TMP LIKE NSFW_DOMAINS");

            List<String> combinedDangerList = new ArrayList<>(10_000);
            for (var dangerListUrl : dangerLists) {
                combinedDangerList.addAll(fetchList(client, dangerListUrl));
            }
            for (String domain : combinedDangerList) {
                insertStmt.setInt(1, NSFW_BLOCK_DANGER);
                insertStmt.setString(2, domain);
                insertStmt.execute();
            }

            List<String> combinedSmutList = new ArrayList<>(10_000);
            for (var smutListUrl : smutLists) {
                combinedSmutList.addAll(fetchList(client, smutListUrl));
            }
            for (String domain : combinedSmutList) {
                insertStmt.setInt(1, NSFW_BLOCK_SMUT);
                insertStmt.setString(2, domain);
                // Bugfix: this loop previously called addBatch() immediately
                // followed by execute(), which runs the statement right away
                // AND leaves a stale entry in the (never-flushed) batch.
                // Execute directly, matching the danger-list loop above.
                insertStmt.execute();
            }

            stmt.execute("""
                    DROP TABLE IF EXISTS NSFW_DOMAINS
                    """);
            stmt.execute("""
                    RENAME TABLE NSFW_DOMAINS_TMP TO NSFW_DOMAINS
                    """);

            sync();
        }
        catch (SQLException ex) {
            logger.error("Failed to fetch NSFW domain lists", ex);
        }
    }

    /**
     * Fetches a single blocklist, one domain per line. URLs ending in ".gz"
     * are transparently gunzipped. Returns an empty list on any failure.
     */
    public List<String> fetchList(HttpClient client, String url) {
        logger.info("Fetching NSFW domain list from {}", url);

        var request = HttpRequest.newBuilder()
                .uri(java.net.URI.create(url))
                .build();

        try {
            if (url.endsWith(".gz")) {
                var response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
                byte[] body = response.body();

                // NOTE(review): no HTTP status check here, unlike the plain-text
                // branch; a non-200 body will simply fail GZIP decoding and be
                // logged -- TODO consider checking statusCode() here as well.
                try (var reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(new ByteArrayInputStream(body))))) {
                    return reader.lines()
                            .filter(StringUtils::isNotEmpty)
                            .toList();
                } catch (Exception e) {
                    logger.error("Error reading GZIP response from {}", url, e);
                }
            } else {
                var response = client.send(request, HttpResponse.BodyHandlers.ofString());
                if (response.statusCode() == 200) {
                    return Arrays.stream(StringUtils.split(response.body(), "\n"))
                            .filter(StringUtils::isNotEmpty)
                            .toList();
                } else {
                    logger.warn("Failed to fetch list from {}: HTTP {}", url, response.statusCode());
                }
            }
        }
        catch (Exception e) {
            logger.error("Error fetching NSFW domain list from {}", url, e);
        }

        return List.of();
    }
}

View File

@@ -0,0 +1,30 @@
package nu.marginalia.nsfw;
import com.google.inject.AbstractModule;
import com.google.inject.Provides;
import jakarta.inject.Named;
import java.util.List;
public class NsfwFilterModule extends AbstractModule {
@Provides
@Named("nsfw.dangerLists")
public List<String> nsfwDomainLists1() {
return List.of(
"https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/cryptojacking/domains",
"https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/malware/domains",
"https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/phishing/domains"
);
}
@Provides
@Named("nsfw.smutLists")
public List<String> nsfwDomainLists2() {
return List.of(
"https://github.com/olbat/ut1-blacklists/raw/refs/heads/master/blacklists/adult/domains.gz",
"https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/gambling/domains"
);
}
public void configure() {}
}

View File

@@ -0,0 +1,108 @@
package nu.marginalia.nsfw;
import com.google.inject.AbstractModule;
import com.google.inject.Guice;
import com.google.inject.Provides;
import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import jakarta.inject.Named;
import nu.marginalia.test.TestMigrationLoader;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.testcontainers.containers.MariaDBContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
@Tag("slow")
@Testcontainers
// The test class doubles as a Guice module (extends AbstractModule) so it can
// bind the container-backed data source and point the filter at test fixtures.
class NsfwDomainFilterTest extends AbstractModule {
    @Container
    static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
            .withDatabaseName("WMSA_prod")
            .withUsername("wmsa")
            .withPassword("wmsa")
            .withNetworkAliases("mariadb");

    static HikariDataSource dataSource;
    static Path tempDir;

    // Runs the production flyway migrations against the container, then seeds
    // EC_DOMAIN with two domains whose auto-assigned ids the test relies on:
    // presumably www.google.com gets id 1 and www.bing.com id 2 (insertion
    // order) -- TODO confirm this holds after migrations.
    @BeforeAll
    public static void setUpDb() throws IOException {
        tempDir = Files.createTempDirectory(NsfwDomainFilterTest.class.getSimpleName());
        System.setProperty("system.homePath", tempDir.toString());

        HikariConfig config = new HikariConfig();
        config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
        config.setUsername("wmsa");
        config.setPassword("wmsa");

        dataSource = new HikariDataSource(config);
        TestMigrationLoader.flywayMigration(dataSource);

        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES (?, ?, 1)")
        ) {
            // Ensure the database is ready
            conn.createStatement().execute("SELECT 1");

            stmt.setString(1, "www.google.com");
            stmt.setString(2, "google.com");
            stmt.executeUpdate();
            stmt.setString(1, "www.bing.com");
            stmt.setString(2, "bing.com");
            stmt.executeUpdate();
        } catch (Exception e) {
            throw new RuntimeException("Failed to connect to the database", e);
        }
    }

    // Fixture blocklist served from marginalia.nu; NOTE(review): the test
    // depends on this remote file's contents (it must include www.google.com)
    // and on network availability.
    @Provides
    @Named("nsfw.dangerLists")
    public List<String> nsfwDomainLists1() {
        return List.of(
                "https://downloads.marginalia.nu/test/list1"
        );
    }

    // Gzipped fixture list; must include both seeded domains for the
    // assertions below to hold.
    @Provides
    @Named("nsfw.smutLists")
    public List<String> nsfwDomainLists2() {
        return List.of(
                "https://downloads.marginalia.nu/test/list2.gz"
        );
    }

    public void configure() {
        bind(HikariDataSource.class).toInstance(dataSource);
    }

    // End-to-end: fetch the fixture lists, rebuild NSFW_DOMAINS, and verify
    // per-tier blocking for the two seeded domain ids.
    @Test
    public void test() {
        var filter = Guice
                .createInjector(this)
                .getInstance(NsfwDomainFilter.class);

        filter.fetchLists();

        assertTrue(filter.isBlocked(1, NsfwDomainFilter.NSFW_BLOCK_DANGER));
        assertTrue(filter.isBlocked(1, NsfwDomainFilter.NSFW_BLOCK_SMUT));
        assertFalse(filter.isBlocked(2, NsfwDomainFilter.NSFW_BLOCK_DANGER));
        assertTrue(filter.isBlocked(2, NsfwDomainFilter.NSFW_BLOCK_SMUT));
    }
}

View File

@@ -1,9 +1,6 @@
package nu.marginalia.api.searchquery; package nu.marginalia.api.searchquery;
import nu.marginalia.api.searchquery.model.query.ProcessedQuery; import nu.marginalia.api.searchquery.model.query.*;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.QueryResponse;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters; import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
import nu.marginalia.api.searchquery.model.results.SearchResultItem; import nu.marginalia.api.searchquery.model.results.SearchResultItem;
@@ -32,6 +29,8 @@ public class QueryProtobufCodec {
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier); builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
builder.setHumanQuery(request.getHumanQuery()); builder.setHumanQuery(request.getHumanQuery());
builder.setNsfwFilterTierValue(request.getNsfwFilterTierValue());
builder.setQuality(IndexProtobufCodec.convertSpecLimit(query.specs.quality)); builder.setQuality(IndexProtobufCodec.convertSpecLimit(query.specs.quality));
builder.setYear(IndexProtobufCodec.convertSpecLimit(query.specs.year)); builder.setYear(IndexProtobufCodec.convertSpecLimit(query.specs.year));
builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size)); builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size));
@@ -78,6 +77,8 @@ public class QueryProtobufCodec {
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier); builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
builder.setHumanQuery(humanQuery); builder.setHumanQuery(humanQuery);
builder.setNsfwFilterTier(RpcIndexQuery.NSFW_FILTER_TIER.DANGER);
builder.setQuality(IndexProtobufCodec.convertSpecLimit(query.specs.quality)); builder.setQuality(IndexProtobufCodec.convertSpecLimit(query.specs.quality));
builder.setYear(IndexProtobufCodec.convertSpecLimit(query.specs.year)); builder.setYear(IndexProtobufCodec.convertSpecLimit(query.specs.year));
builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size)); builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size));
@@ -112,6 +113,7 @@ public class QueryProtobufCodec {
request.getSearchSetIdentifier(), request.getSearchSetIdentifier(),
QueryStrategy.valueOf(request.getQueryStrategy()), QueryStrategy.valueOf(request.getQueryStrategy()),
RpcTemporalBias.Bias.valueOf(request.getTemporalBias().getBias().name()), RpcTemporalBias.Bias.valueOf(request.getTemporalBias().getBias().name()),
NsfwFilterTier.fromCodedValue(request.getNsfwFilterTierValue()),
request.getPagination().getPage() request.getPagination().getPage()
); );
} }
@@ -327,6 +329,7 @@ public class QueryProtobufCodec {
.setRank(IndexProtobufCodec.convertSpecLimit(params.rank())) .setRank(IndexProtobufCodec.convertSpecLimit(params.rank()))
.setSearchSetIdentifier(params.identifier()) .setSearchSetIdentifier(params.identifier())
.setQueryStrategy(params.queryStrategy().name()) .setQueryStrategy(params.queryStrategy().name())
.setNsfwFilterTierValue(params.filterTier().getCodedValue())
.setTemporalBias(RpcTemporalBias.newBuilder() .setTemporalBias(RpcTemporalBias.newBuilder()
.setBias(RpcTemporalBias.Bias.valueOf(params.temporalBias().name())) .setBias(RpcTemporalBias.Bias.valueOf(params.temporalBias().name()))
.build()) .build())

View File

@@ -0,0 +1,26 @@
package nu.marginalia.api.searchquery.model.query;
/**
 * Client-facing NSFW filtering level, carried over the query API as an
 * integer wire value (see RpcQsQuery.NSFW_FILTER_TIER).
 */
public enum NsfwFilterTier {
    OFF(0),
    DANGER(1),
    PORN_AND_GAMBLING(2);

    private final int codedValue; // same as ordinal() for now, but can be changed later if needed

    NsfwFilterTier(int codedValue) {
        this.codedValue = codedValue;
    }

    /**
     * Maps a wire value back to its tier.
     *
     * @throws IllegalArgumentException if no tier has the given coded value
     */
    public static NsfwFilterTier fromCodedValue(int codedValue) {
        for (NsfwFilterTier tier : NsfwFilterTier.values()) {
            if (tier.codedValue == codedValue) {
                return tier;
            }
        }
        // Bugfix: message previously misspelled the type name ("NsfwFilterTirer")
        throw new IllegalArgumentException("Invalid coded value for NsfwFilterTier: " + codedValue);
    }

    /** The stable integer used on the wire for this tier. */
    public int getCodedValue() {
        return codedValue;
    }
}

View File

@@ -25,10 +25,11 @@ public record QueryParams(
String identifier, String identifier,
QueryStrategy queryStrategy, QueryStrategy queryStrategy,
RpcTemporalBias.Bias temporalBias, RpcTemporalBias.Bias temporalBias,
NsfwFilterTier filterTier,
int page int page
) )
{ {
public QueryParams(String query, RpcQueryLimits limits, String identifier) { public QueryParams(String query, RpcQueryLimits limits, String identifier, NsfwFilterTier filterTier) {
this(query, null, this(query, null,
List.of(), List.of(),
List.of(), List.of(),
@@ -43,6 +44,7 @@ public record QueryParams(
identifier, identifier,
QueryStrategy.AUTO, QueryStrategy.AUTO,
RpcTemporalBias.Bias.NONE, RpcTemporalBias.Bias.NONE,
filterTier,
1 // page 1 // page
); );
} }

View File

@@ -32,6 +32,14 @@ message RpcQsQuery {
RpcTemporalBias temporalBias = 16; RpcTemporalBias temporalBias = 16;
RpcQsQueryPagination pagination = 17; RpcQsQueryPagination pagination = 17;
NSFW_FILTER_TIER nsfwFilterTier = 18;
enum NSFW_FILTER_TIER {
NONE = 0;
DANGER = 1;
PORN_AND_GAMBLING = 2;
};
} }
/* Query service query response */ /* Query service query response */
@@ -78,8 +86,17 @@ message RpcIndexQuery {
RpcQueryLimits queryLimits = 10; RpcQueryLimits queryLimits = 10;
string queryStrategy = 11; // Named query configuration string queryStrategy = 11; // Named query configuration
RpcResultRankingParameters parameters = 12; RpcResultRankingParameters parameters = 12;
NSFW_FILTER_TIER nsfwFilterTier = 13;
enum NSFW_FILTER_TIER {
NONE = 0;
DANGER = 1;
PORN_AND_GAMBLING = 2;
};
} }
/* A tagged union encoding some limit on a field */ /* A tagged union encoding some limit on a field */
message RpcSpecLimit { message RpcSpecLimit {
int32 value = 1; int32 value = 1;

View File

@@ -19,6 +19,7 @@ dependencies {
implementation project(':code:common:model') implementation project(':code:common:model')
implementation project(':code:common:service') implementation project(':code:common:service')
implementation project(':code:functions:nsfw-domain-filter')
implementation project(':code:functions:search-query:api') implementation project(':code:functions:search-query:api')
implementation project(':code:index:query') implementation project(':code:index:query')

View File

@@ -11,6 +11,7 @@ import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters; import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
import nu.marginalia.index.api.IndexClient; import nu.marginalia.index.api.IndexClient;
import nu.marginalia.nsfw.NsfwDomainFilter;
import nu.marginalia.service.server.DiscoverableService; import nu.marginalia.service.server.DiscoverableService;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@@ -34,13 +35,16 @@ public class QueryGRPCService
private final QueryFactory queryFactory; private final QueryFactory queryFactory;
private final NsfwDomainFilter nsfwDomainFilter;
private final IndexClient indexClient; private final IndexClient indexClient;
@Inject @Inject
public QueryGRPCService(QueryFactory queryFactory, public QueryGRPCService(QueryFactory queryFactory,
NsfwDomainFilter nsfwDomainFilter,
IndexClient indexClient) IndexClient indexClient)
{ {
this.queryFactory = queryFactory; this.queryFactory = queryFactory;
this.nsfwDomainFilter = nsfwDomainFilter;
this.indexClient = indexClient; this.indexClient = indexClient;
} }

View File

@@ -3,6 +3,7 @@ package nu.marginalia.query.svc;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import nu.marginalia.api.searchquery.RpcQueryLimits; import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.RpcTemporalBias; import nu.marginalia.api.searchquery.RpcTemporalBias;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.functions.searchquery.QueryFactory; import nu.marginalia.functions.searchquery.QueryFactory;
@@ -58,6 +59,7 @@ public class QueryFactoryTest {
"NONE", "NONE",
QueryStrategy.AUTO, QueryStrategy.AUTO,
RpcTemporalBias.Bias.NONE, RpcTemporalBias.Bias.NONE,
NsfwFilterTier.OFF,
0), null).specs; 0), null).specs;
} }

View File

@@ -17,6 +17,7 @@ dependencies {
implementation project(':code:common:service') implementation project(':code:common:service')
implementation project(':code:common:db') implementation project(':code:common:db')
implementation project(':code:libraries:message-queue') implementation project(':code:libraries:message-queue')
implementation project(':code:functions:nsfw-domain-filter')
implementation project(':code:functions:search-query:api') implementation project(':code:functions:search-query:api')
implementation libs.bundles.slf4j implementation libs.bundles.slf4j

View File

@@ -2,11 +2,13 @@ package nu.marginalia.index.api;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import io.prometheus.client.Counter;
import nu.marginalia.api.searchquery.IndexApiGrpc; import nu.marginalia.api.searchquery.IndexApiGrpc;
import nu.marginalia.api.searchquery.RpcDecoratedResultItem; import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
import nu.marginalia.api.searchquery.RpcIndexQuery; import nu.marginalia.api.searchquery.RpcIndexQuery;
import nu.marginalia.db.DomainBlacklistImpl; import nu.marginalia.db.DomainBlacklistImpl;
import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.nsfw.NsfwDomainFilter;
import nu.marginalia.service.client.GrpcChannelPoolFactory; import nu.marginalia.service.client.GrpcChannelPoolFactory;
import nu.marginalia.service.client.GrpcMultiNodeChannelPool; import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
import nu.marginalia.service.discovery.property.ServiceKey; import nu.marginalia.service.discovery.property.ServiceKey;
@@ -28,14 +30,26 @@ public class IndexClient {
private static final Logger logger = LoggerFactory.getLogger(IndexClient.class); private static final Logger logger = LoggerFactory.getLogger(IndexClient.class);
private final GrpcMultiNodeChannelPool<IndexApiGrpc.IndexApiBlockingStub> channelPool; private final GrpcMultiNodeChannelPool<IndexApiGrpc.IndexApiBlockingStub> channelPool;
private final DomainBlacklistImpl blacklist; private final DomainBlacklistImpl blacklist;
private final NsfwDomainFilter nsfwDomainFilter;
Counter wmsa_index_query_count = Counter.build()
.name("wmsa_nsfw_filter_result_count")
.labelNames("tier")
.help("Count of results filtered by NSFW tier")
.register();
private static final ExecutorService executor = Executors.newCachedThreadPool(); private static final ExecutorService executor = Executors.newCachedThreadPool();
@Inject @Inject
public IndexClient(GrpcChannelPoolFactory channelPoolFactory, DomainBlacklistImpl blacklist) { public IndexClient(GrpcChannelPoolFactory channelPoolFactory,
DomainBlacklistImpl blacklist,
NsfwDomainFilter nsfwDomainFilter
) {
this.channelPool = channelPoolFactory.createMulti( this.channelPool = channelPoolFactory.createMulti(
ServiceKey.forGrpcApi(IndexApiGrpc.class, ServicePartition.multi()), ServiceKey.forGrpcApi(IndexApiGrpc.class, ServicePartition.multi()),
IndexApiGrpc::newBlockingStub); IndexApiGrpc::newBlockingStub);
this.blacklist = blacklist; this.blacklist = blacklist;
this.nsfwDomainFilter = nsfwDomainFilter;
} }
private static final Comparator<RpcDecoratedResultItem> comparator = private static final Comparator<RpcDecoratedResultItem> comparator =
@@ -52,7 +66,7 @@ public class IndexClient {
public AggregateQueryResponse executeQueries(RpcIndexQuery indexRequest, Pagination pagination) { public AggregateQueryResponse executeQueries(RpcIndexQuery indexRequest, Pagination pagination) {
final int requestedMaxResults = indexRequest.getQueryLimits().getResultsTotal(); final int requestedMaxResults = indexRequest.getQueryLimits().getResultsTotal();
int filterTier = indexRequest.getNsfwFilterTierValue();
AtomicInteger totalNumResults = new AtomicInteger(0); AtomicInteger totalNumResults = new AtomicInteger(0);
List<RpcDecoratedResultItem> results = List<RpcDecoratedResultItem> results =
@@ -74,7 +88,7 @@ public class IndexClient {
} }
}) })
.flatMap(List::stream) .flatMap(List::stream)
.filter(item -> !isBlacklisted(item)) .filter(item -> !isBlacklisted(item, filterTier))
.sorted(comparator) .sorted(comparator)
.skip(Math.max(0, (pagination.page - 1) * pagination.pageSize)) .skip(Math.max(0, (pagination.page - 1) * pagination.pageSize))
.limit(pagination.pageSize) .limit(pagination.pageSize)
@@ -83,8 +97,23 @@ public class IndexClient {
return new AggregateQueryResponse(results, pagination.page(), totalNumResults.get()); return new AggregateQueryResponse(results, pagination.page(), totalNumResults.get());
} }
private boolean isBlacklisted(RpcDecoratedResultItem item) { static String[] tierNames = {
return blacklist.isBlacklisted(UrlIdCodec.getDomainId(item.getRawItem().getCombinedId())); "OFF",
"DANGER",
"NSFW"
};
private boolean isBlacklisted(RpcDecoratedResultItem item, int filterTier) {
int domainId = UrlIdCodec.getDomainId(item.getRawItem().getCombinedId());
if (blacklist.isBlacklisted(domainId)) {
return true;
}
if (nsfwDomainFilter.isBlocked(domainId, filterTier)) {
wmsa_index_query_count.labels(tierNames[filterTier]).inc();
return true;
}
return false;
} }
} }

View File

@@ -7,6 +7,7 @@ import nu.marginalia.api.model.ApiSearchResultQueryDetails;
import nu.marginalia.api.model.ApiSearchResults; import nu.marginalia.api.model.ApiSearchResults;
import nu.marginalia.api.searchquery.QueryClient; import nu.marginalia.api.searchquery.QueryClient;
import nu.marginalia.api.searchquery.RpcQueryLimits; import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier; import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
@@ -29,9 +30,10 @@ public class ApiSearchOperator {
public ApiSearchResults query(String query, public ApiSearchResults query(String query,
int count, int count,
int index) int index,
NsfwFilterTier filterTier)
{ {
var rsp = queryClient.search(createParams(query, count, index)); var rsp = queryClient.search(createParams(query, count, index, filterTier));
return new ApiSearchResults("RESTRICTED", query, return new ApiSearchResults("RESTRICTED", query,
rsp.results() rsp.results()
@@ -42,7 +44,7 @@ public class ApiSearchOperator {
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
private QueryParams createParams(String query, int count, int index) { private QueryParams createParams(String query, int count, int index, NsfwFilterTier filterTirer) {
SearchSetIdentifier searchSet = selectSearchSet(index); SearchSetIdentifier searchSet = selectSearchSet(index);
return new QueryParams( return new QueryParams(
@@ -53,7 +55,8 @@ public class ApiSearchOperator {
.setTimeoutMs(150) .setTimeoutMs(150)
.setFetchSize(8192) .setFetchSize(8192)
.build(), .build(),
searchSet.name()); searchSet.name(),
filterTirer);
} }
private SearchSetIdentifier selectSearchSet(int index) { private SearchSetIdentifier selectSearchSet(int index) {

View File

@@ -6,6 +6,7 @@ import io.prometheus.client.Counter;
import io.prometheus.client.Histogram; import io.prometheus.client.Histogram;
import nu.marginalia.api.model.ApiLicense; import nu.marginalia.api.model.ApiLicense;
import nu.marginalia.api.model.ApiSearchResults; import nu.marginalia.api.model.ApiSearchResults;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.api.svc.LicenseService; import nu.marginalia.api.svc.LicenseService;
import nu.marginalia.api.svc.RateLimiterService; import nu.marginalia.api.svc.RateLimiterService;
import nu.marginalia.api.svc.ResponseCache; import nu.marginalia.api.svc.ResponseCache;
@@ -119,6 +120,16 @@ public class ApiService extends SparkService {
int count = intParam(request, "count", 20); int count = intParam(request, "count", 20);
int index = intParam(request, "index", 3); int index = intParam(request, "index", 3);
int nsfw = intParam(request, "nsfw", 1);
NsfwFilterTier nsfwFilterTier;
try {
nsfwFilterTier = NsfwFilterTier.fromCodedValue(nsfw);
}
catch (IllegalArgumentException e) {
Spark.halt(400, "Invalid nsfw parameter value");
return null; // Unreachable, but required to satisfy the compiler
}
logger.info(queryMarker, "{} Search {}", license.key, query); logger.info(queryMarker, "{} Search {}", license.key, query);
@@ -126,7 +137,7 @@ public class ApiService extends SparkService {
.labels(license.key) .labels(license.key)
.time(() -> .time(() ->
searchOperator searchOperator
.query(query, count, index) .query(query, count, index, nsfwFilterTier)
.withLicense(license.getLicense()) .withLicense(license.getLicense())
); );
} }

View File

@@ -2,6 +2,7 @@ package nu.marginalia.search;
import nu.marginalia.api.searchquery.RpcQueryLimits; import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.RpcTemporalBias; import nu.marginalia.api.searchquery.RpcTemporalBias;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier; import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
@@ -52,6 +53,7 @@ public class SearchQueryParamFactory {
profile.searchSetIdentifier.name(), profile.searchSetIdentifier.name(),
userParams.strategy(), userParams.strategy(),
userParams.temporalBias(), userParams.temporalBias(),
userParams.filterTier(),
userParams.page() userParams.page()
); );
@@ -78,6 +80,7 @@ public class SearchQueryParamFactory {
SearchSetIdentifier.NONE.name(), SearchSetIdentifier.NONE.name(),
QueryStrategy.AUTO, QueryStrategy.AUTO,
RpcTemporalBias.Bias.NONE, RpcTemporalBias.Bias.NONE,
NsfwFilterTier.OFF,
1 1
); );
} }
@@ -98,6 +101,7 @@ public class SearchQueryParamFactory {
SearchSetIdentifier.NONE.name(), SearchSetIdentifier.NONE.name(),
QueryStrategy.AUTO, QueryStrategy.AUTO,
RpcTemporalBias.Bias.NONE, RpcTemporalBias.Bias.NONE,
NsfwFilterTier.DANGER,
1 1
); );
} }
@@ -118,6 +122,7 @@ public class SearchQueryParamFactory {
SearchSetIdentifier.NONE.name(), SearchSetIdentifier.NONE.name(),
QueryStrategy.AUTO, QueryStrategy.AUTO,
RpcTemporalBias.Bias.NONE, RpcTemporalBias.Bias.NONE,
NsfwFilterTier.DANGER,
1 1
); );
} }

View File

@@ -2,6 +2,7 @@ package nu.marginalia.search.command;
import nu.marginalia.WebsiteUrl; import nu.marginalia.WebsiteUrl;
import nu.marginalia.api.searchquery.RpcTemporalBias; import nu.marginalia.api.searchquery.RpcTemporalBias;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.search.model.SearchProfile; import nu.marginalia.search.model.SearchProfile;
@@ -23,6 +24,10 @@ public record SearchParameters(String query,
int page int page
) { ) {
public NsfwFilterTier filterTier() {
return NsfwFilterTier.DANGER;
}
public SearchParameters(String queryString, Request request) { public SearchParameters(String queryString, Request request) {
this( this(
queryString, queryString,

View File

@@ -2,6 +2,7 @@ package nu.marginalia.search;
import nu.marginalia.api.searchquery.RpcQueryLimits; import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.RpcTemporalBias; import nu.marginalia.api.searchquery.RpcTemporalBias;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier; import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
@@ -53,6 +54,7 @@ public class SearchQueryParamFactory {
profile.searchSetIdentifier.name(), profile.searchSetIdentifier.name(),
userParams.strategy(), userParams.strategy(),
userParams.temporalBias(), userParams.temporalBias(),
userParams.filterTier(),
userParams.page() userParams.page()
); );
@@ -79,6 +81,7 @@ public class SearchQueryParamFactory {
SearchSetIdentifier.NONE.name(), SearchSetIdentifier.NONE.name(),
QueryStrategy.AUTO, QueryStrategy.AUTO,
RpcTemporalBias.Bias.NONE, RpcTemporalBias.Bias.NONE,
NsfwFilterTier.OFF,
page page
); );
} }
@@ -99,6 +102,7 @@ public class SearchQueryParamFactory {
SearchSetIdentifier.NONE.name(), SearchSetIdentifier.NONE.name(),
QueryStrategy.AUTO, QueryStrategy.AUTO,
RpcTemporalBias.Bias.NONE, RpcTemporalBias.Bias.NONE,
NsfwFilterTier.DANGER,
page page
); );
} }
@@ -119,6 +123,7 @@ public class SearchQueryParamFactory {
SearchSetIdentifier.NONE.name(), SearchSetIdentifier.NONE.name(),
QueryStrategy.AUTO, QueryStrategy.AUTO,
RpcTemporalBias.Bias.NONE, RpcTemporalBias.Bias.NONE,
NsfwFilterTier.DANGER,
1 1
); );
} }

View File

@@ -18,6 +18,7 @@ import nu.marginalia.service.server.JoobyService;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.nio.charset.StandardCharsets;
import java.util.List; import java.util.List;
import java.util.NoSuchElementException; import java.util.NoSuchElementException;
@@ -41,6 +42,8 @@ public class SearchService extends JoobyService {
.help("Search service error count") .help("Search service error count")
.register(); .register();
private final String openSearchXML;
@Inject @Inject
public SearchService(BaseServiceParams params, public SearchService(BaseServiceParams params,
WebsiteUrl websiteUrl, WebsiteUrl websiteUrl,
@@ -69,6 +72,13 @@ public class SearchService extends JoobyService {
this.siteSubscriptionService = siteSubscriptionService; this.siteSubscriptionService = siteSubscriptionService;
this.faviconClient = faviconClient; this.faviconClient = faviconClient;
this.domainQueries = domainQueries; this.domainQueries = domainQueries;
try (var is = ClassLoader.getSystemResourceAsStream("static/opensearch.xml")) {
openSearchXML = new String(is.readAllBytes(), StandardCharsets.UTF_8);
}
catch (Exception e) {
throw new RuntimeException("Failed to load OpenSearch XML", e);
}
} }
@Override @Override
@@ -82,6 +92,11 @@ public class SearchService extends JoobyService {
jooby.get("/site/https://*", this::handleSiteUrlRedirect); jooby.get("/site/https://*", this::handleSiteUrlRedirect);
jooby.get("/site/http://*", this::handleSiteUrlRedirect); jooby.get("/site/http://*", this::handleSiteUrlRedirect);
jooby.get("/opensearch.xml", ctx -> {
ctx.setResponseType(MediaType.valueOf("application/opensearchdescription+xml"));
return openSearchXML;
});
String emptySvg = "<svg xmlns=\"http://www.w3.org/2000/svg\"></svg>"; String emptySvg = "<svg xmlns=\"http://www.w3.org/2000/svg\"></svg>";
jooby.get("/site/{domain}/favicon", ctx -> { jooby.get("/site/{domain}/favicon", ctx -> {
String domain = ctx.path("domain").value(); String domain = ctx.path("domain").value();

View File

@@ -2,6 +2,7 @@ package nu.marginalia.search.command;
import nu.marginalia.WebsiteUrl; import nu.marginalia.WebsiteUrl;
import nu.marginalia.api.searchquery.RpcTemporalBias; import nu.marginalia.api.searchquery.RpcTemporalBias;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
@@ -24,6 +25,10 @@ public record SearchParameters(WebsiteUrl url,
int page int page
) { ) {
public NsfwFilterTier filterTier() {
return NsfwFilterTier.DANGER;
}
public static SearchParameters defaultsForQuery(WebsiteUrl url, String query, int page) { public static SearchParameters defaultsForQuery(WebsiteUrl url, String query, int page) {
return new SearchParameters( return new SearchParameters(
url, url,

View File

@@ -3,6 +3,7 @@ package nu.marginalia.control.app.svc;
import com.google.inject.Inject; import com.google.inject.Inject;
import nu.marginalia.api.searchquery.QueryClient; import nu.marginalia.api.searchquery.QueryClient;
import nu.marginalia.api.searchquery.RpcQueryLimits; import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.control.ControlRendererFactory; import nu.marginalia.control.ControlRendererFactory;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
@@ -81,7 +82,8 @@ public class SearchToBanService {
.setFetchSize(8192) .setFetchSize(8192)
.build() .build()
, ,
"NONE" "NONE",
NsfwFilterTier.OFF
)); ));
} }
} }

View File

@@ -44,6 +44,7 @@ dependencies {
implementation project(':code:functions:link-graph:api') implementation project(':code:functions:link-graph:api')
implementation project(':code:functions:favicon') implementation project(':code:functions:favicon')
implementation project(':code:functions:favicon:api') implementation project(':code:functions:favicon:api')
implementation project(':code:functions:nsfw-domain-filter')
implementation project(':code:processes:crawling-process:model') implementation project(':code:processes:crawling-process:model')
implementation project(':code:processes:crawling-process:model') implementation project(':code:processes:crawling-process:model')

View File

@@ -3,13 +3,14 @@ package nu.marginalia.executor;
import com.google.inject.Guice; import com.google.inject.Guice;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Injector; import com.google.inject.Injector;
import nu.marginalia.nsfw.NsfwFilterModule;
import nu.marginalia.service.MainClass; import nu.marginalia.service.MainClass;
import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.service.module.ServiceDiscoveryModule;
import nu.marginalia.service.ServiceId; import nu.marginalia.service.ServiceId;
import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.module.DatabaseModule; import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.service.module.ServiceConfigurationModule; import nu.marginalia.service.module.ServiceConfigurationModule;
import nu.marginalia.service.module.ServiceDiscoveryModule;
import nu.marginalia.service.server.Initialization; import nu.marginalia.service.server.Initialization;
import nu.marginalia.service.server.NodeStatusWatcher; import nu.marginalia.service.server.NodeStatusWatcher;
@@ -27,6 +28,7 @@ public class ExecutorMain extends MainClass {
Injector injector = Guice.createInjector( Injector injector = Guice.createInjector(
new ExecutorModule(), new ExecutorModule(),
new DatabaseModule(false), new DatabaseModule(false),
new NsfwFilterModule(),
new ServiceDiscoveryModule(), new ServiceDiscoveryModule(),
new ServiceConfigurationModule(ServiceId.Executor) new ServiceConfigurationModule(ServiceId.Executor)
); );

View File

@@ -37,6 +37,7 @@ dependencies {
implementation project(':code:functions:search-query:api') implementation project(':code:functions:search-query:api')
implementation project(':code:functions:link-graph:api') implementation project(':code:functions:link-graph:api')
implementation project(':code:functions:link-graph:aggregate') implementation project(':code:functions:link-graph:aggregate')
implementation project(':code:functions:nsfw-domain-filter')
implementation libs.bundles.slf4j implementation libs.bundles.slf4j

View File

@@ -6,6 +6,7 @@ import com.google.inject.Inject;
import nu.marginalia.api.searchquery.RpcQueryLimits; import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.RpcResultRankingParameters; import nu.marginalia.api.searchquery.RpcResultRankingParameters;
import nu.marginalia.api.searchquery.RpcTemporalBias; import nu.marginalia.api.searchquery.RpcTemporalBias;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters; import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
import nu.marginalia.functions.searchquery.QueryGRPCService; import nu.marginalia.functions.searchquery.QueryGRPCService;
@@ -61,7 +62,7 @@ public class QueryBasicInterface {
.setTimeoutMs(250) .setTimeoutMs(250)
.setFetchSize(8192) .setFetchSize(8192)
.build() .build()
, set); , set, NsfwFilterTier.OFF);
var pagination = new IndexClient.Pagination(page, count); var pagination = new IndexClient.Pagination(page, count);
@@ -114,7 +115,7 @@ public class QueryBasicInterface {
.setTimeoutMs(250) .setTimeoutMs(250)
.setFetchSize(8192) .setFetchSize(8192)
.build(), .build(),
set); set, NsfwFilterTier.OFF);
var pagination = new IndexClient.Pagination(page, count); var pagination = new IndexClient.Pagination(page, count);

View File

@@ -3,13 +3,14 @@ package nu.marginalia.query;
import com.google.inject.Guice; import com.google.inject.Guice;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Injector; import com.google.inject.Injector;
import nu.marginalia.nsfw.NsfwFilterModule;
import nu.marginalia.service.MainClass; import nu.marginalia.service.MainClass;
import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.service.module.ServiceDiscoveryModule;
import nu.marginalia.service.ServiceId; import nu.marginalia.service.ServiceId;
import nu.marginalia.service.module.ServiceConfigurationModule; import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.module.DatabaseModule; import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.service.module.ServiceConfigurationModule;
import nu.marginalia.service.module.ServiceDiscoveryModule;
import nu.marginalia.service.server.Initialization; import nu.marginalia.service.server.Initialization;
public class QueryMain extends MainClass { public class QueryMain extends MainClass {
@@ -26,6 +27,7 @@ public class QueryMain extends MainClass {
Injector injector = Guice.createInjector( Injector injector = Guice.createInjector(
new QueryModule(), new QueryModule(),
new DatabaseModule(false), new DatabaseModule(false),
new NsfwFilterModule(),
new ServiceDiscoveryModule(), new ServiceDiscoveryModule(),
new ServiceConfigurationModule(ServiceId.Query) new ServiceConfigurationModule(ServiceId.Query)
); );

View File

@@ -10,3 +10,4 @@
2025-05-08: Deploy assistant. 2025-05-08: Deploy assistant.
2025-05-17: Redeploy all. 2025-05-17: Redeploy all.
2025-05-28: Deploy assistant and browserless. 2025-05-28: Deploy assistant and browserless.
2025-06-06: Deploy assistant and browserless.

View File

@@ -1,61 +0,0 @@
# This docker-compose file is for the screenshot-capture-tool service.
#
# It is a standalone daemon that captures screenshots of web pages, based
# on the domain database of Marginalia Search.
#
# It does not start the search engine itself.
#
x-svc: &service
env_file:
- "run/env/service.env"
volumes:
- conf:/wmsa/conf:ro
- data:/wmsa/data
- logs:/var/log/wmsa
networks:
- wmsa
services:
screenshot-capture-tool:
<<: *service
image: "marginalia/screenshot-capture-tool"
container_name: "screenshot-capture-tool"
networks:
- wmsa
- headlesschrome
depends_on:
- browserless
browserless:
<<: *service
image: "browserless/chrome"
container_name: "headlesschrome"
env_file:
- "run/env/browserless.env"
ports:
- "3000:3000"
networks:
- wmsa
- headlesschrome
networks:
wmsa:
headlesschrome:
volumes:
logs:
driver: local
driver_opts:
type: none
o: bind
device: run/logs
conf:
driver: local
driver_opts:
type: none
o: bind
device: run/conf
data:
driver: local
driver_opts:
type: none
o: bind
device: run/data

View File

@@ -20,6 +20,7 @@ include 'code:functions:favicon'
include 'code:functions:favicon:api' include 'code:functions:favicon:api'
include 'code:functions:domain-info' include 'code:functions:domain-info'
include 'code:functions:domain-info:api' include 'code:functions:domain-info:api'
include 'code:functions:nsfw-domain-filter'
include 'code:functions:link-graph:partition' include 'code:functions:link-graph:partition'
include 'code:functions:link-graph:aggregate' include 'code:functions:link-graph:aggregate'
@@ -153,9 +154,9 @@ dependencyResolutionManagement {
library('guice', 'com.google.inject', 'guice').version('7.0.0') library('guice', 'com.google.inject', 'guice').version('7.0.0')
library('guava', 'com.google.guava', 'guava').version('32.0.1-jre') library('guava', 'com.google.guava', 'guava').version('32.0.1-jre')
library('protobuf', 'com.google.protobuf', 'protobuf-java').version('3.16.3') library('protobuf', 'com.google.protobuf', 'protobuf-java').version('3.16.3')
library('grpc-protobuf', 'io.grpc', 'grpc-protobuf').version('1.49.2') library('grpc-protobuf', 'io.grpc', 'grpc-protobuf').version('1.73.0')
library('grpc-stub', 'io.grpc', 'grpc-stub').version('1.49.2') library('grpc-stub', 'io.grpc', 'grpc-stub').version('1.73.0')
library('grpc-netty', 'io.grpc', 'grpc-netty-shaded').version('1.49.2') library('grpc-netty', 'io.grpc', 'grpc-netty-shaded').version('1.73.0')
library('prometheus', 'io.prometheus', 'simpleclient').version('0.16.0') library('prometheus', 'io.prometheus', 'simpleclient').version('0.16.0')
library('prometheus-servlet', 'io.prometheus', 'simpleclient_servlet').version('0.16.0') library('prometheus-servlet', 'io.prometheus', 'simpleclient_servlet').version('0.16.0')

View File

@@ -344,7 +344,7 @@ if __name__ == '__main__':
parser.add_argument('-v', '--verify', help='Verify the tags are valid, if present', action='store_true') parser.add_argument('-v', '--verify', help='Verify the tags are valid, if present', action='store_true')
parser.add_argument('-a', '--add', help='Add the tags provided as a new deployment tag, usually combined with -t', action='store_true') parser.add_argument('-a', '--add', help='Add the tags provided as a new deployment tag, usually combined with -t', action='store_true')
parser.add_argument('-t', '--tag', help='Use the specified tag value instead of the head git tag starting with deploy-') parser.add_argument('-t', '--tag', help='Use the specified tag value instead of the head git tag starting with deploy-; Expecting tags on the format "+service", "-service", or "group"')
args = parser.parse_args() args = parser.parse_args()
tags = args.tag tags = args.tag
@@ -372,7 +372,7 @@ if __name__ == '__main__':
build_and_deploy(plan, SERVICE_CONFIG) build_and_deploy(plan, SERVICE_CONFIG)
else: else:
print("No tags found") print("No tags found.")
except ValueError as e: except ValueError as e:
print(f"Error: {e}") print(f"Error: {e}")