1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

6 Commits

Author SHA1 Message Date
Viktor Lofgren
37aaa90dc9 (deploy) Clean up deploy script 2025-06-07 13:43:56 +02:00
Viktor
24022c5adc Merge pull request #203 from MarginaliaSearch/nsfw-domain-lists
Nsfw blocking via UT1 domain lists
2025-06-07 13:24:05 +02:00
Viktor Lofgren
1de9ecc0b6 (nsfw) Add metrics to the filtering so we can monitor it 2025-06-07 13:17:05 +02:00
Viktor Lofgren
9b80245ea0 (nsfw) Move filtering to the IndexApiClient, and add filtering options to the internal APIs and public API. 2025-06-07 12:54:20 +02:00
Viktor Lofgren
4e1595c1a6 (nsfw) Initial work on adding UT1-based domain filtering 2025-06-06 14:23:37 +02:00
Viktor Lofgren
0be8585fa5 Add tag format hint to deploy script 2025-06-06 10:03:18 +02:00
32 changed files with 691 additions and 122 deletions

View File

@@ -0,0 +1,5 @@
-- Domains to be hidden from search results when NSFW filtering is enabled.
-- ID mirrors EC_DOMAIN.ID (rows are inserted by selecting from EC_DOMAIN);
-- TIER is the blocking level: 1 = dangerous (malware/phishing/cryptojacking),
-- 2 = smut (adult content/gambling).  See NsfwDomainFilter.fetchLists().
CREATE TABLE IF NOT EXISTS WMSA_prod.NSFW_DOMAINS (
ID INT NOT NULL AUTO_INCREMENT,
TIER INT NOT NULL,
PRIMARY KEY (ID)
);

View File

@@ -37,6 +37,7 @@ dependencies {
implementation project(':code:functions:link-graph:api')
implementation project(':code:functions:live-capture:api')
implementation project(':code:functions:search-query')
implementation project(':code:functions:nsfw-domain-filter')
implementation project(':code:execution:api')
implementation project(':code:processes:crawling-process:model')

View File

@@ -6,6 +6,7 @@ import java.util.Set;
public enum ExecutorActor {
PREC_EXPORT_ALL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
SYNC_NSFW_LISTS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
CRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
RECRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
@@ -35,7 +36,8 @@ public enum ExecutorActor {
LIVE_CRAWL(NodeProfile.REALTIME),
PROC_LIVE_CRAWL_SPAWNER(NodeProfile.REALTIME),
SCRAPE_FEEDS(NodeProfile.REALTIME),
UPDATE_RSS(NodeProfile.REALTIME);
UPDATE_RSS(NodeProfile.REALTIME)
;
public String id() {
return "fsm:" + name().toLowerCase();

View File

@@ -68,6 +68,7 @@ public class ExecutorActorControlService {
ExecutorActorStateMachines stateMachines,
MigrateCrawlDataActor migrateCrawlDataActor,
ExportAllPrecessionActor exportAllPrecessionActor,
UpdateNsfwFiltersActor updateNsfwFiltersActor,
UpdateRssActor updateRssActor) throws SQLException {
this.messageQueueFactory = messageQueueFactory;
this.eventLog = baseServiceParams.eventLog;
@@ -109,6 +110,7 @@ public class ExecutorActorControlService {
register(ExecutorActor.UPDATE_RSS, updateRssActor);
register(ExecutorActor.MIGRATE_CRAWL_DATA, migrateCrawlDataActor);
register(ExecutorActor.SYNC_NSFW_LISTS, updateNsfwFiltersActor);
if (serviceConfiguration.node() == 1) {
register(ExecutorActor.PREC_EXPORT_ALL, exportAllPrecessionActor);

View File

@@ -0,0 +1,53 @@
package nu.marginalia.actor.task;
import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.nsfw.NsfwDomainFilter;
import nu.marginalia.service.module.ServiceConfiguration;
@Singleton
public class UpdateNsfwFiltersActor extends RecordActorPrototype {
    private final ServiceConfiguration serviceConfiguration;
    private final NsfwDomainFilter nsfwDomainFilter;

    /** Entry step; verifies we are running on the coordinating node. */
    public record Initial() implements ActorStep {}
    /** Work step; triggers a re-download of the NSFW domain lists. */
    public record Run() implements ActorStep {}

    @Inject
    public UpdateNsfwFiltersActor(Gson gson,
                                  ServiceConfiguration serviceConfiguration,
                                  NsfwDomainFilter nsfwDomainFilter)
    {
        super(gson);
        this.serviceConfiguration = serviceConfiguration;
        this.nsfwDomainFilter = nsfwDomainFilter;
    }

    @Override
    public ActorStep transition(ActorStep self) throws Exception {
        return switch (self) {
            // The filter tables are shared, so only node 1 performs the sync
            case Initial() when serviceConfiguration.node() != 1 ->
                    new Error("This actor can only run on node 1");
            case Initial() -> new Run();
            case Run() -> {
                nsfwDomainFilter.fetchLists();
                yield new End();
            }
            default -> new Error();
        };
    }

    @Override
    public String describe() {
        return "Sync NSFW filters";
    }
}

View File

@@ -0,0 +1,43 @@
plugins {
    id 'java'
    id 'jvm-test-suite'
}

java {
    toolchain {
        // Use the JVM version declared once in the root build script
        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
    }
}

// Shared source-set layout used by all modules in this project
apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {
    implementation project(':code:common:config')
    implementation project(':code:common:model')
    implementation project(':code:common:db')

    implementation libs.bundles.slf4j
    implementation libs.prometheus
    implementation libs.guava
    implementation libs.commons.lang3
    // Guice pulls in its own guava; exclude it so the version above wins
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
    implementation libs.notnull
    implementation libs.fastutil
    implementation libs.bundles.mariadb

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito

    // NOTE(review): the 1.17.4 version is repeated on the artifacts below even
    // though the BOM should supply it — consider dropping the explicit versions
    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
    testImplementation libs.commons.codec
    testImplementation project(':code:common:service')
    testImplementation 'org.testcontainers:mariadb:1.17.4'
    testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
    testImplementation project(':code:libraries:test-helpers')
}

View File

@@ -0,0 +1,192 @@
package nu.marginalia.nsfw;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import com.zaxxer.hikari.HikariDataSource;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.InputStreamReader;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.zip.GZIPInputStream;
@Singleton
public class NsfwDomainFilter {
    private final HikariDataSource dataSource;

    // URLs of UT1 blacklist files; "danger" populates tier 1, "smut" tier 2
    private final List<String> dangerLists;
    private final List<String> smutLists;

    // Replaced wholesale by sync(); volatile so reader threads always observe
    // a fully built set rather than one under construction.
    private volatile IntOpenHashSet blockedDomainIdsTier1 = new IntOpenHashSet();
    private volatile IntOpenHashSet blockedDomainIdsTier2 = new IntOpenHashSet();

    private static final Logger logger = LoggerFactory.getLogger(NsfwDomainFilter.class);

    public static final int NSFW_DISABLE = 0;
    public static final int NSFW_BLOCK_DANGER = 1;
    public static final int NSFW_BLOCK_SMUT = 2;

    /** Number of pending inserts before a batch is flushed to the database. */
    private static final int INSERT_BATCH_SIZE = 1000;

    @Inject
    public NsfwDomainFilter(HikariDataSource dataSource,
                            @Named("nsfw.dangerLists") List<String> dangerLists,
                            @Named("nsfw.smutLists") List<String> smutLists
                            ) {
        this.dataSource = dataSource;
        this.dangerLists = dangerLists;
        this.smutLists = smutLists;

        // Periodically refresh the in-memory sets so list updates performed
        // elsewhere (e.g. by another service) are eventually picked up.
        Thread.ofPlatform().daemon().name("NsfwDomainFilterSync").start(() -> {
            while (true) {
                sync();
                try {
                    TimeUnit.HOURS.sleep(1);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    break; // Exit the loop if interrupted
                }
            }
        });
    }

    /**
     * Returns true if the given domain should be hidden at the given filter tier.
     * Tiers are cumulative: tier 2 blocks everything tier 1 does, plus smut.
     *
     * @param domainId EC_DOMAIN id of the result's domain
     * @param tier     0 = no filtering, 1 = danger only, 2 = danger + smut
     */
    public boolean isBlocked(int domainId, int tier) {
        if (tier == 0)
            return false;

        if (tier >= 1 && blockedDomainIdsTier1.contains(domainId))
            return true;
        if (tier >= 2 && blockedDomainIdsTier2.contains(domainId))
            return true;

        return false;
    }

    /** Reloads the in-memory blocked-domain sets from the NSFW_DOMAINS table. */
    private synchronized void sync() {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("SELECT ID, TIER FROM NSFW_DOMAINS")
        ) {
            var rs = stmt.executeQuery();
            IntOpenHashSet tier1 = new IntOpenHashSet();
            IntOpenHashSet tier2 = new IntOpenHashSet();

            while (rs.next()) {
                int domainId = rs.getInt("ID");
                int tier = rs.getInt("TIER");

                switch (tier) {
                    case 1 -> tier1.add(domainId);
                    case 2 -> tier2.add(domainId);
                }
            }

            this.blockedDomainIdsTier1 = tier1;
            this.blockedDomainIdsTier2 = tier2;

            logger.info("NSFW domain filter synced: {} tier 1, {} tier 2", tier1.size(), tier2.size());
        }
        catch (SQLException ex) {
            logger.error("Failed to sync NSFW domain filter", ex);
        }
    }

    /**
     * Downloads the configured UT1 lists and rebuilds the NSFW_DOMAINS table,
     * then refreshes the in-memory sets.  The new data is staged in a
     * temporary table and swapped in via RENAME so concurrent readers never
     * observe a partially populated list.
     */
    public synchronized void fetchLists() {
        try (var conn = dataSource.getConnection();
             HttpClient client = HttpClient.newBuilder()
                     .followRedirects(HttpClient.Redirect.ALWAYS)
                     .build();
             var stmt = conn.createStatement();
             var insertStmt = conn.prepareStatement("INSERT INTO NSFW_DOMAINS_TMP (ID, TIER) SELECT ID, ? FROM EC_DOMAIN WHERE DOMAIN_NAME = ?")) {

            stmt.execute("DROP TABLE IF EXISTS NSFW_DOMAINS_TMP");
            stmt.execute("CREATE TABLE NSFW_DOMAINS_TMP LIKE NSFW_DOMAINS");

            insertDomains(insertStmt, client, dangerLists, NSFW_BLOCK_DANGER);
            insertDomains(insertStmt, client, smutLists, NSFW_BLOCK_SMUT);

            stmt.execute("""
                    DROP TABLE IF EXISTS NSFW_DOMAINS
                    """);
            stmt.execute("""
                    RENAME TABLE NSFW_DOMAINS_TMP TO NSFW_DOMAINS
                    """);

            sync();
        }
        catch (SQLException ex) {
            logger.error("Failed to fetch NSFW domain lists", ex);
        }
    }

    /**
     * Fetches each list URL and inserts its domains at the given tier.
     * Inserts are accumulated with addBatch() and flushed with executeBatch()
     * in fixed-size chunks to bound memory use.  (Previously addBatch() was
     * mixed with per-row execute() and executeBatch() was never called, so an
     * ever-growing batch was silently discarded when the statement closed.)
     */
    private void insertDomains(PreparedStatement insertStmt, HttpClient client, List<String> listUrls, int tier) throws SQLException {
        int pending = 0;

        for (var listUrl : listUrls) {
            for (String domain : fetchList(client, listUrl)) {
                insertStmt.setInt(1, tier);
                insertStmt.setString(2, domain);
                insertStmt.addBatch();

                if (++pending >= INSERT_BATCH_SIZE) {
                    insertStmt.executeBatch();
                    pending = 0;
                }
            }
        }

        if (pending > 0) {
            insertStmt.executeBatch();
        }
    }

    /**
     * Downloads a single list, transparently gunzipping URLs ending in .gz.
     * Returns an empty list on any failure (non-200 response, I/O error).
     */
    public List<String> fetchList(HttpClient client, String url) {
        logger.info("Fetching NSFW domain list from {}", url);

        var request = HttpRequest.newBuilder()
                .uri(java.net.URI.create(url))
                .build();

        try {
            if (url.endsWith(".gz")) {
                var response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());

                // Previously the status was not checked here, so an HTML error
                // page could be fed to the gunzipper
                if (response.statusCode() != 200) {
                    logger.warn("Failed to fetch list from {}: HTTP {}", url, response.statusCode());
                    return List.of();
                }

                byte[] body = response.body();

                try (var reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(new ByteArrayInputStream(body))))) {
                    return reader.lines()
                            .filter(StringUtils::isNotEmpty)
                            .toList();
                } catch (Exception e) {
                    logger.error("Error reading GZIP response from {}", url, e);
                }
            } else {
                var response = client.send(request, HttpResponse.BodyHandlers.ofString());
                if (response.statusCode() == 200) {
                    return Arrays.stream(StringUtils.split(response.body(), "\n"))
                            .filter(StringUtils::isNotEmpty)
                            .toList();
                } else {
                    logger.warn("Failed to fetch list from {}: HTTP {}", url, response.statusCode());
                }
            }
        }
        catch (Exception e) {
            logger.error("Error fetching NSFW domain list from {}", url, e);
        }

        return List.of();
    }
}

View File

@@ -0,0 +1,30 @@
package nu.marginalia.nsfw;
import com.google.inject.AbstractModule;
import com.google.inject.Provides;
import jakarta.inject.Named;
import java.util.List;
/**
 * Guice module providing the production UT1 blacklist URLs consumed by
 * NsfwDomainFilter via the named bindings "nsfw.dangerLists" and
 * "nsfw.smutLists".
 */
public class NsfwFilterModule extends AbstractModule {

    /** Lists of outright dangerous domains: cryptojacking, malware, phishing. */
    private static final List<String> DANGER_LISTS = List.of(
            "https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/cryptojacking/domains",
            "https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/malware/domains",
            "https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/phishing/domains"
    );

    /** Lists of adult-content and gambling domains. */
    private static final List<String> SMUT_LISTS = List.of(
            "https://github.com/olbat/ut1-blacklists/raw/refs/heads/master/blacklists/adult/domains.gz",
            "https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/gambling/domains"
    );

    @Provides
    @Named("nsfw.dangerLists")
    public List<String> nsfwDomainLists1() {
        return DANGER_LISTS;
    }

    @Provides
    @Named("nsfw.smutLists")
    public List<String> nsfwDomainLists2() {
        return SMUT_LISTS;
    }

    public void configure() {}
}

View File

@@ -0,0 +1,108 @@
package nu.marginalia.nsfw;
import com.google.inject.AbstractModule;
import com.google.inject.Guice;
import com.google.inject.Provides;
import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import jakarta.inject.Named;
import nu.marginalia.test.TestMigrationLoader;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.testcontainers.containers.MariaDBContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
 * Integration test for NsfwDomainFilter against a real MariaDB instance run
 * via Testcontainers.
 *
 * The test class doubles as a Guice module (it extends AbstractModule): it
 * binds the container-backed data source and overrides the list URLs to point
 * at small static fixtures instead of the real UT1 downloads.
 */
@Tag("slow")
@Testcontainers
class NsfwDomainFilterTest extends AbstractModule {

    @Container
    static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
            .withDatabaseName("WMSA_prod")
            .withUsername("wmsa")
            .withPassword("wmsa")
            .withNetworkAliases("mariadb");

    static HikariDataSource dataSource;
    static Path tempDir;

    @BeforeAll
    public static void setUpDb() throws IOException {
        tempDir = Files.createTempDirectory(NsfwDomainFilterTest.class.getSimpleName());
        // Some services resolve their home directory from this property
        System.setProperty("system.homePath", tempDir.toString());

        HikariConfig config = new HikariConfig();
        config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
        config.setUsername("wmsa");
        config.setPassword("wmsa");

        dataSource = new HikariDataSource(config);

        // Apply the schema migrations (creates NSFW_DOMAINS, EC_DOMAIN, ...)
        TestMigrationLoader.flywayMigration(dataSource);

        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES (?, ?, 1)")
        ) {

            // Ensure the database is ready
            conn.createStatement().execute("SELECT 1");

            // NOTE(review): the assertions below assume AUTO_INCREMENT assigns
            // ID 1 to www.google.com and ID 2 to www.bing.com in insertion
            // order on a freshly migrated database
            stmt.setString(1, "www.google.com");
            stmt.setString(2, "google.com");
            stmt.executeUpdate();
            stmt.setString(1, "www.bing.com");
            stmt.setString(2, "bing.com");
            stmt.executeUpdate();
        } catch (Exception e) {
            throw new RuntimeException("Failed to connect to the database", e);
        }
    }

    // Danger-tier fixture; presumably lists www.google.com and www.bing.com
    // per the assertions below — the fixture content itself is hosted remotely
    @Provides
    @Named("nsfw.dangerLists")
    public List<String> nsfwDomainLists1() {
        return List.of(
                "https://downloads.marginalia.nu/test/list1"
        );
    }

    // Smut-tier fixture, served gzipped to exercise the .gz code path
    @Provides
    @Named("nsfw.smutLists")
    public List<String> nsfwDomainLists2() {
        return List.of(
                "https://downloads.marginalia.nu/test/list2.gz"
        );
    }

    public void configure() {
        bind(HikariDataSource.class).toInstance(dataSource);
    }

    @Test
    public void test() {
        var filter = Guice
                .createInjector(this)
                .getInstance(NsfwDomainFilter.class);

        filter.fetchLists();

        // Domain 1 (www.google.com) is on the danger fixture list, so it is
        // blocked at both tiers; domain 2 (www.bing.com) only on the smut list
        assertTrue(filter.isBlocked(1, NsfwDomainFilter.NSFW_BLOCK_DANGER));
        assertTrue(filter.isBlocked(1, NsfwDomainFilter.NSFW_BLOCK_SMUT));
        assertFalse(filter.isBlocked(2, NsfwDomainFilter.NSFW_BLOCK_DANGER));
        assertTrue(filter.isBlocked(2, NsfwDomainFilter.NSFW_BLOCK_SMUT));
    }
}

View File

@@ -1,9 +1,6 @@
package nu.marginalia.api.searchquery;
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.QueryResponse;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.*;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
@@ -32,6 +29,8 @@ public class QueryProtobufCodec {
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
builder.setHumanQuery(request.getHumanQuery());
builder.setNsfwFilterTierValue(request.getNsfwFilterTierValue());
builder.setQuality(IndexProtobufCodec.convertSpecLimit(query.specs.quality));
builder.setYear(IndexProtobufCodec.convertSpecLimit(query.specs.year));
builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size));
@@ -78,6 +77,8 @@ public class QueryProtobufCodec {
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
builder.setHumanQuery(humanQuery);
builder.setNsfwFilterTier(RpcIndexQuery.NSFW_FILTER_TIER.DANGER);
builder.setQuality(IndexProtobufCodec.convertSpecLimit(query.specs.quality));
builder.setYear(IndexProtobufCodec.convertSpecLimit(query.specs.year));
builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size));
@@ -112,6 +113,7 @@ public class QueryProtobufCodec {
request.getSearchSetIdentifier(),
QueryStrategy.valueOf(request.getQueryStrategy()),
RpcTemporalBias.Bias.valueOf(request.getTemporalBias().getBias().name()),
NsfwFilterTier.fromCodedValue(request.getNsfwFilterTierValue()),
request.getPagination().getPage()
);
}
@@ -327,6 +329,7 @@ public class QueryProtobufCodec {
.setRank(IndexProtobufCodec.convertSpecLimit(params.rank()))
.setSearchSetIdentifier(params.identifier())
.setQueryStrategy(params.queryStrategy().name())
.setNsfwFilterTierValue(params.filterTier().getCodedValue())
.setTemporalBias(RpcTemporalBias.newBuilder()
.setBias(RpcTemporalBias.Bias.valueOf(params.temporalBias().name()))
.build())

View File

@@ -0,0 +1,26 @@
package nu.marginalia.api.searchquery.model.query;
/**
 * Level of NSFW filtering to apply to search results, serialized over the
 * wire as an integer (see NSFW_FILTER_TIER in the protobuf schema).
 */
public enum NsfwFilterTier {
    OFF(0),
    DANGER(1),
    PORN_AND_GAMBLING(2);

    private final int codedValue; // same as ordinal() for now, but can be changed later if needed

    NsfwFilterTier(int codedValue) {
        this.codedValue = codedValue;
    }

    /**
     * Looks up a tier by its wire value.
     *
     * @throws IllegalArgumentException if no tier has the given coded value
     */
    public static NsfwFilterTier fromCodedValue(int codedValue) {
        for (NsfwFilterTier tier : NsfwFilterTier.values()) {
            if (tier.codedValue == codedValue) {
                return tier;
            }
        }
        // Fixed typo in the message: was "NsfwFilterTirer"
        throw new IllegalArgumentException("Invalid coded value for NsfwFilterTier: " + codedValue);
    }

    public int getCodedValue() {
        return codedValue;
    }
}

View File

@@ -25,10 +25,11 @@ public record QueryParams(
String identifier,
QueryStrategy queryStrategy,
RpcTemporalBias.Bias temporalBias,
NsfwFilterTier filterTier,
int page
)
{
public QueryParams(String query, RpcQueryLimits limits, String identifier) {
public QueryParams(String query, RpcQueryLimits limits, String identifier, NsfwFilterTier filterTier) {
this(query, null,
List.of(),
List.of(),
@@ -43,6 +44,7 @@ public record QueryParams(
identifier,
QueryStrategy.AUTO,
RpcTemporalBias.Bias.NONE,
filterTier,
1 // page
);
}

View File

@@ -32,6 +32,14 @@ message RpcQsQuery {
RpcTemporalBias temporalBias = 16;
RpcQsQueryPagination pagination = 17;
NSFW_FILTER_TIER nsfwFilterTier = 18;
enum NSFW_FILTER_TIER {
NONE = 0;
DANGER = 1;
PORN_AND_GAMBLING = 2;
};
}
/* Query service query response */
@@ -78,8 +86,17 @@ message RpcIndexQuery {
RpcQueryLimits queryLimits = 10;
string queryStrategy = 11; // Named query configuration
RpcResultRankingParameters parameters = 12;
NSFW_FILTER_TIER nsfwFilterTier = 13;
enum NSFW_FILTER_TIER {
NONE = 0;
DANGER = 1;
PORN_AND_GAMBLING = 2;
};
}
/* A tagged union encoding some limit on a field */
message RpcSpecLimit {
int32 value = 1;

View File

@@ -19,6 +19,7 @@ dependencies {
implementation project(':code:common:model')
implementation project(':code:common:service')
implementation project(':code:functions:nsfw-domain-filter')
implementation project(':code:functions:search-query:api')
implementation project(':code:index:query')

View File

@@ -11,6 +11,7 @@ import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
import nu.marginalia.index.api.IndexClient;
import nu.marginalia.nsfw.NsfwDomainFilter;
import nu.marginalia.service.server.DiscoverableService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -34,13 +35,16 @@ public class QueryGRPCService
private final QueryFactory queryFactory;
private final NsfwDomainFilter nsfwDomainFilter;
private final IndexClient indexClient;
@Inject
public QueryGRPCService(QueryFactory queryFactory,
NsfwDomainFilter nsfwDomainFilter,
IndexClient indexClient)
{
this.queryFactory = queryFactory;
this.nsfwDomainFilter = nsfwDomainFilter;
this.indexClient = indexClient;
}

View File

@@ -3,6 +3,7 @@ package nu.marginalia.query.svc;
import nu.marginalia.WmsaHome;
import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.RpcTemporalBias;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.functions.searchquery.QueryFactory;
@@ -58,6 +59,7 @@ public class QueryFactoryTest {
"NONE",
QueryStrategy.AUTO,
RpcTemporalBias.Bias.NONE,
NsfwFilterTier.OFF,
0), null).specs;
}

View File

@@ -17,6 +17,7 @@ dependencies {
implementation project(':code:common:service')
implementation project(':code:common:db')
implementation project(':code:libraries:message-queue')
implementation project(':code:functions:nsfw-domain-filter')
implementation project(':code:functions:search-query:api')
implementation libs.bundles.slf4j

View File

@@ -2,11 +2,13 @@ package nu.marginalia.index.api;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.prometheus.client.Counter;
import nu.marginalia.api.searchquery.IndexApiGrpc;
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
import nu.marginalia.api.searchquery.RpcIndexQuery;
import nu.marginalia.db.DomainBlacklistImpl;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.nsfw.NsfwDomainFilter;
import nu.marginalia.service.client.GrpcChannelPoolFactory;
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
import nu.marginalia.service.discovery.property.ServiceKey;
@@ -28,14 +30,26 @@ public class IndexClient {
private static final Logger logger = LoggerFactory.getLogger(IndexClient.class);
private final GrpcMultiNodeChannelPool<IndexApiGrpc.IndexApiBlockingStub> channelPool;
private final DomainBlacklistImpl blacklist;
private final NsfwDomainFilter nsfwDomainFilter;
Counter wmsa_index_query_count = Counter.build()
.name("wmsa_nsfw_filter_result_count")
.labelNames("tier")
.help("Count of results filtered by NSFW tier")
.register();
private static final ExecutorService executor = Executors.newCachedThreadPool();
@Inject
public IndexClient(GrpcChannelPoolFactory channelPoolFactory, DomainBlacklistImpl blacklist) {
public IndexClient(GrpcChannelPoolFactory channelPoolFactory,
DomainBlacklistImpl blacklist,
NsfwDomainFilter nsfwDomainFilter
) {
this.channelPool = channelPoolFactory.createMulti(
ServiceKey.forGrpcApi(IndexApiGrpc.class, ServicePartition.multi()),
IndexApiGrpc::newBlockingStub);
this.blacklist = blacklist;
this.nsfwDomainFilter = nsfwDomainFilter;
}
private static final Comparator<RpcDecoratedResultItem> comparator =
@@ -52,7 +66,7 @@ public class IndexClient {
public AggregateQueryResponse executeQueries(RpcIndexQuery indexRequest, Pagination pagination) {
final int requestedMaxResults = indexRequest.getQueryLimits().getResultsTotal();
int filterTier = indexRequest.getNsfwFilterTierValue();
AtomicInteger totalNumResults = new AtomicInteger(0);
List<RpcDecoratedResultItem> results =
@@ -74,7 +88,7 @@ public class IndexClient {
}
})
.flatMap(List::stream)
.filter(item -> !isBlacklisted(item))
.filter(item -> !isBlacklisted(item, filterTier))
.sorted(comparator)
.skip(Math.max(0, (pagination.page - 1) * pagination.pageSize))
.limit(pagination.pageSize)
@@ -83,8 +97,23 @@ public class IndexClient {
return new AggregateQueryResponse(results, pagination.page(), totalNumResults.get());
}
private boolean isBlacklisted(RpcDecoratedResultItem item) {
return blacklist.isBlacklisted(UrlIdCodec.getDomainId(item.getRawItem().getCombinedId()));
static String[] tierNames = {
"OFF",
"DANGER",
"NSFW"
};
private boolean isBlacklisted(RpcDecoratedResultItem item, int filterTier) {
int domainId = UrlIdCodec.getDomainId(item.getRawItem().getCombinedId());
if (blacklist.isBlacklisted(domainId)) {
return true;
}
if (nsfwDomainFilter.isBlocked(domainId, filterTier)) {
wmsa_index_query_count.labels(tierNames[filterTier]).inc();
return true;
}
return false;
}
}

View File

@@ -7,6 +7,7 @@ import nu.marginalia.api.model.ApiSearchResultQueryDetails;
import nu.marginalia.api.model.ApiSearchResults;
import nu.marginalia.api.searchquery.QueryClient;
import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
@@ -29,9 +30,10 @@ public class ApiSearchOperator {
public ApiSearchResults query(String query,
int count,
int index)
int index,
NsfwFilterTier filterTier)
{
var rsp = queryClient.search(createParams(query, count, index));
var rsp = queryClient.search(createParams(query, count, index, filterTier));
return new ApiSearchResults("RESTRICTED", query,
rsp.results()
@@ -42,7 +44,7 @@ public class ApiSearchOperator {
.collect(Collectors.toList()));
}
private QueryParams createParams(String query, int count, int index) {
private QueryParams createParams(String query, int count, int index, NsfwFilterTier filterTirer) {
SearchSetIdentifier searchSet = selectSearchSet(index);
return new QueryParams(
@@ -53,7 +55,8 @@ public class ApiSearchOperator {
.setTimeoutMs(150)
.setFetchSize(8192)
.build(),
searchSet.name());
searchSet.name(),
filterTirer);
}
private SearchSetIdentifier selectSearchSet(int index) {

View File

@@ -6,6 +6,7 @@ import io.prometheus.client.Counter;
import io.prometheus.client.Histogram;
import nu.marginalia.api.model.ApiLicense;
import nu.marginalia.api.model.ApiSearchResults;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.api.svc.LicenseService;
import nu.marginalia.api.svc.RateLimiterService;
import nu.marginalia.api.svc.ResponseCache;
@@ -119,6 +120,16 @@ public class ApiService extends SparkService {
int count = intParam(request, "count", 20);
int index = intParam(request, "index", 3);
int nsfw = intParam(request, "nsfw", 1);
NsfwFilterTier nsfwFilterTier;
try {
nsfwFilterTier = NsfwFilterTier.fromCodedValue(nsfw);
}
catch (IllegalArgumentException e) {
Spark.halt(400, "Invalid nsfw parameter value");
return null; // Unreachable, but required to satisfy the compiler
}
logger.info(queryMarker, "{} Search {}", license.key, query);
@@ -126,7 +137,7 @@ public class ApiService extends SparkService {
.labels(license.key)
.time(() ->
searchOperator
.query(query, count, index)
.query(query, count, index, nsfwFilterTier)
.withLicense(license.getLicense())
);
}

View File

@@ -2,6 +2,7 @@ package nu.marginalia.search;
import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.RpcTemporalBias;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
@@ -52,6 +53,7 @@ public class SearchQueryParamFactory {
profile.searchSetIdentifier.name(),
userParams.strategy(),
userParams.temporalBias(),
userParams.filterTier(),
userParams.page()
);
@@ -78,6 +80,7 @@ public class SearchQueryParamFactory {
SearchSetIdentifier.NONE.name(),
QueryStrategy.AUTO,
RpcTemporalBias.Bias.NONE,
NsfwFilterTier.OFF,
1
);
}
@@ -98,6 +101,7 @@ public class SearchQueryParamFactory {
SearchSetIdentifier.NONE.name(),
QueryStrategy.AUTO,
RpcTemporalBias.Bias.NONE,
NsfwFilterTier.DANGER,
1
);
}
@@ -118,6 +122,7 @@ public class SearchQueryParamFactory {
SearchSetIdentifier.NONE.name(),
QueryStrategy.AUTO,
RpcTemporalBias.Bias.NONE,
NsfwFilterTier.DANGER,
1
);
}

View File

@@ -2,6 +2,7 @@ package nu.marginalia.search.command;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.api.searchquery.RpcTemporalBias;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.search.model.SearchProfile;
@@ -23,6 +24,10 @@ public record SearchParameters(String query,
int page
) {
public NsfwFilterTier filterTier() {
return NsfwFilterTier.DANGER;
}
public SearchParameters(String queryString, Request request) {
this(
queryString,

View File

@@ -2,6 +2,7 @@ package nu.marginalia.search;
import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.RpcTemporalBias;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
@@ -53,6 +54,7 @@ public class SearchQueryParamFactory {
profile.searchSetIdentifier.name(),
userParams.strategy(),
userParams.temporalBias(),
userParams.filterTier(),
userParams.page()
);
@@ -79,6 +81,7 @@ public class SearchQueryParamFactory {
SearchSetIdentifier.NONE.name(),
QueryStrategy.AUTO,
RpcTemporalBias.Bias.NONE,
NsfwFilterTier.OFF,
page
);
}
@@ -99,6 +102,7 @@ public class SearchQueryParamFactory {
SearchSetIdentifier.NONE.name(),
QueryStrategy.AUTO,
RpcTemporalBias.Bias.NONE,
NsfwFilterTier.DANGER,
page
);
}
@@ -119,6 +123,7 @@ public class SearchQueryParamFactory {
SearchSetIdentifier.NONE.name(),
QueryStrategy.AUTO,
RpcTemporalBias.Bias.NONE,
NsfwFilterTier.DANGER,
1
);
}

View File

@@ -2,6 +2,7 @@ package nu.marginalia.search.command;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.api.searchquery.RpcTemporalBias;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.model.EdgeDomain;
@@ -24,6 +25,10 @@ public record SearchParameters(WebsiteUrl url,
int page
) {
public NsfwFilterTier filterTier() {
return NsfwFilterTier.DANGER;
}
public static SearchParameters defaultsForQuery(WebsiteUrl url, String query, int page) {
return new SearchParameters(
url,

View File

@@ -3,6 +3,7 @@ package nu.marginalia.control.app.svc;
import com.google.inject.Inject;
import nu.marginalia.api.searchquery.QueryClient;
import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.control.ControlRendererFactory;
import nu.marginalia.model.EdgeUrl;
@@ -81,7 +82,8 @@ public class SearchToBanService {
.setFetchSize(8192)
.build()
,
"NONE"
"NONE",
NsfwFilterTier.OFF
));
}
}

View File

@@ -44,6 +44,7 @@ dependencies {
implementation project(':code:functions:link-graph:api')
implementation project(':code:functions:favicon')
implementation project(':code:functions:favicon:api')
implementation project(':code:functions:nsfw-domain-filter')
implementation project(':code:processes:crawling-process:model')
implementation project(':code:processes:crawling-process:model')

View File

@@ -3,13 +3,14 @@ package nu.marginalia.executor;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.nsfw.NsfwFilterModule;
import nu.marginalia.service.MainClass;
import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.service.module.ServiceDiscoveryModule;
import nu.marginalia.service.ServiceId;
import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.service.module.ServiceConfigurationModule;
import nu.marginalia.service.module.ServiceDiscoveryModule;
import nu.marginalia.service.server.Initialization;
import nu.marginalia.service.server.NodeStatusWatcher;
@@ -27,6 +28,7 @@ public class ExecutorMain extends MainClass {
Injector injector = Guice.createInjector(
new ExecutorModule(),
new DatabaseModule(false),
new NsfwFilterModule(),
new ServiceDiscoveryModule(),
new ServiceConfigurationModule(ServiceId.Executor)
);

View File

@@ -37,6 +37,7 @@ dependencies {
implementation project(':code:functions:search-query:api')
implementation project(':code:functions:link-graph:api')
implementation project(':code:functions:link-graph:aggregate')
implementation project(':code:functions:nsfw-domain-filter')
implementation libs.bundles.slf4j

View File

@@ -6,6 +6,7 @@ import com.google.inject.Inject;
import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
import nu.marginalia.api.searchquery.RpcTemporalBias;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
import nu.marginalia.functions.searchquery.QueryGRPCService;
@@ -61,7 +62,7 @@ public class QueryBasicInterface {
.setTimeoutMs(250)
.setFetchSize(8192)
.build()
, set);
, set, NsfwFilterTier.OFF);
var pagination = new IndexClient.Pagination(page, count);
@@ -114,7 +115,7 @@ public class QueryBasicInterface {
.setTimeoutMs(250)
.setFetchSize(8192)
.build(),
set);
set, NsfwFilterTier.OFF);
var pagination = new IndexClient.Pagination(page, count);

View File

@@ -3,13 +3,14 @@ package nu.marginalia.query;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.nsfw.NsfwFilterModule;
import nu.marginalia.service.MainClass;
import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.service.module.ServiceDiscoveryModule;
import nu.marginalia.service.ServiceId;
import nu.marginalia.service.module.ServiceConfigurationModule;
import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.service.module.ServiceConfigurationModule;
import nu.marginalia.service.module.ServiceDiscoveryModule;
import nu.marginalia.service.server.Initialization;
public class QueryMain extends MainClass {
@@ -26,6 +27,7 @@ public class QueryMain extends MainClass {
Injector injector = Guice.createInjector(
new QueryModule(),
new DatabaseModule(false),
new NsfwFilterModule(),
new ServiceDiscoveryModule(),
new ServiceConfigurationModule(ServiceId.Query)
);

View File

@@ -20,6 +20,7 @@ include 'code:functions:favicon'
include 'code:functions:favicon:api'
include 'code:functions:domain-info'
include 'code:functions:domain-info:api'
include 'code:functions:nsfw-domain-filter'
include 'code:functions:link-graph:partition'
include 'code:functions:link-graph:aggregate'
@@ -153,9 +154,9 @@ dependencyResolutionManagement {
library('guice', 'com.google.inject', 'guice').version('7.0.0')
library('guava', 'com.google.guava', 'guava').version('32.0.1-jre')
library('protobuf', 'com.google.protobuf', 'protobuf-java').version('3.16.3')
library('grpc-protobuf', 'io.grpc', 'grpc-protobuf').version('1.49.2')
library('grpc-stub', 'io.grpc', 'grpc-stub').version('1.49.2')
library('grpc-netty', 'io.grpc', 'grpc-netty-shaded').version('1.49.2')
library('grpc-protobuf', 'io.grpc', 'grpc-protobuf').version('1.73.0')
library('grpc-stub', 'io.grpc', 'grpc-stub').version('1.73.0')
library('grpc-netty', 'io.grpc', 'grpc-netty-shaded').version('1.73.0')
library('prometheus', 'io.prometheus', 'simpleclient').version('0.16.0')
library('prometheus-servlet', 'io.prometheus', 'simpleclient_servlet').version('0.16.0')

View File

@@ -5,9 +5,6 @@ import subprocess, os
from typing import List, Set, Dict, Optional
import argparse
build_dir = "/app/search.marginalia.nu/build"
docker_dir = "/app/search.marginalia.nu/docker"
@dataclass
class ServiceConfig:
"""Configuration for a service"""
@@ -17,6 +14,99 @@ class ServiceConfig:
deploy_tier: int
groups: Set[str]
# Define the service configurations
build_dir = "/app/search.marginalia.nu/build"
docker_dir = "/app/search.marginalia.nu/docker"
SERVICE_CONFIG = {
'search': ServiceConfig(
gradle_target=':code:services-application:search-service:docker',
docker_name='search-service',
instances=2,
deploy_tier=2,
groups={"all", "frontend", "core"}
),
'search-legacy': ServiceConfig(
gradle_target=':code:services-application:search-service-legacy:docker',
docker_name='search-service-legacy',
instances=None,
deploy_tier=3,
groups={"all", "frontend", "core"}
),
'api': ServiceConfig(
gradle_target=':code:services-application:api-service:docker',
docker_name='api-service',
instances=2,
deploy_tier=1,
groups={"all", "core"}
),
'browserless': ServiceConfig(
gradle_target=':code:tools:browserless:docker',
docker_name='browserless',
instances=None,
deploy_tier=2,
groups={"all", "core"}
),
'assistant': ServiceConfig(
gradle_target=':code:services-core:assistant-service:docker',
docker_name='assistant-service',
instances=2,
deploy_tier=2,
groups={"all", "core"}
),
'explorer': ServiceConfig(
gradle_target=':code:services-application:explorer-service:docker',
docker_name='explorer-service',
instances=None,
deploy_tier=1,
groups={"all", "extra"}
),
'dating': ServiceConfig(
gradle_target=':code:services-application:dating-service:docker',
docker_name='dating-service',
instances=None,
deploy_tier=1,
groups={"all", "extra"}
),
'index': ServiceConfig(
gradle_target=':code:services-core:index-service:docker',
docker_name='index-service',
instances=10,
deploy_tier=3,
groups={"all", "index"}
),
'executor': ServiceConfig(
gradle_target=':code:services-core:executor-service:docker',
docker_name='executor-service',
instances=10,
deploy_tier=3,
groups={"all", "executor"}
),
'control': ServiceConfig(
gradle_target=':code:services-core:control-service:docker',
docker_name='control-service',
instances=None,
deploy_tier=0,
groups={"all", "core"}
),
'status': ServiceConfig(
gradle_target=':code:services-application:status-service:docker',
docker_name='status-service',
instances=None,
deploy_tier=4,
groups={"all"}
),
'query': ServiceConfig(
gradle_target=':code:services-core:query-service:docker',
docker_name='query-service',
instances=2,
deploy_tier=2,
groups={"all", "query"}
),
}
@dataclass
class DeploymentPlan:
services_to_build: List[str]
@@ -76,7 +166,7 @@ def parse_deployment_tags(
instances_to_hold = set()
available_services = set(service_config.keys())
available_groups = set()
available_groups = set.union(*[service.groups for service in service_config.values()])
partitions = set()
@@ -89,7 +179,6 @@ def parse_deployment_tags(
partitions.add(int(p))
if tag.startswith('deploy:'):
parts = tag[7:].strip().split(',')
for part in parts:
part = part.strip()
@@ -250,92 +339,7 @@ def add_tags(tags: str) -> None:
# Example usage:
if __name__ == '__main__':
# Define service configuration
SERVICE_CONFIG = {
'search': ServiceConfig(
gradle_target=':code:services-application:search-service:docker',
docker_name='search-service',
instances=2,
deploy_tier=2,
groups={"all", "frontend", "core"}
),
'search-legacy': ServiceConfig(
gradle_target=':code:services-application:search-service-legacy:docker',
docker_name='search-service-legacy',
instances=None,
deploy_tier=3,
groups={"all", "frontend", "core"}
),
'api': ServiceConfig(
gradle_target=':code:services-application:api-service:docker',
docker_name='api-service',
instances=2,
deploy_tier=1,
groups={"all", "core"}
),
'browserless': ServiceConfig(
gradle_target=':code:tools:browserless:docker',
docker_name='browserless',
instances=None,
deploy_tier=2,
groups={"all", "core"}
),
'assistant': ServiceConfig(
gradle_target=':code:services-core:assistant-service:docker',
docker_name='assistant-service',
instances=2,
deploy_tier=2,
groups={"all", "core"}
),
'explorer': ServiceConfig(
gradle_target=':code:services-application:explorer-service:docker',
docker_name='explorer-service',
instances=None,
deploy_tier=1,
groups={"all", "extra"}
),
'dating': ServiceConfig(
gradle_target=':code:services-application:dating-service:docker',
docker_name='dating-service',
instances=None,
deploy_tier=1,
groups={"all", "extra"}
),
'index': ServiceConfig(
gradle_target=':code:services-core:index-service:docker',
docker_name='index-service',
instances=10,
deploy_tier=3,
groups={"all", "index"}
),
'executor': ServiceConfig(
gradle_target=':code:services-core:executor-service:docker',
docker_name='executor-service',
instances=10,
deploy_tier=3,
groups={"all", "executor"}
),
'control': ServiceConfig(
gradle_target=':code:services-core:control-service:docker',
docker_name='control-service',
instances=None,
deploy_tier=0,
groups={"all", "core"}
),
'status': ServiceConfig(
gradle_target=':code:services-application:status-service:docker',
docker_name='status-service',
instances=None,
deploy_tier=4,
groups={"all"}
),
'query': ServiceConfig(
gradle_target=':code:services-core:query-service:docker',
docker_name='query-service',
instances=2,
deploy_tier=2,
groups={"all", "query"}
),
}
try:
parser = argparse.ArgumentParser(
@@ -344,7 +348,7 @@ if __name__ == '__main__':
parser.add_argument('-v', '--verify', help='Verify the tags are valid, if present', action='store_true')
parser.add_argument('-a', '--add', help='Add the tags provided as a new deployment tag, usually combined with -t', action='store_true')
parser.add_argument('-t', '--tag', help='Use the specified tag value instead of the head git tag starting with deploy-')
parser.add_argument('-t', '--tag', help='Use the specified tag value instead of the head git tag starting with deploy-; Expecting tags on the format "+service", "-service", or "group"')
args = parser.parse_args()
tags = args.tag
@@ -372,7 +376,7 @@ if __name__ == '__main__':
build_and_deploy(plan, SERVICE_CONFIG)
else:
print("No tags found")
print("No tags found.")
except ValueError as e:
print(f"Error: {e}")