mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
Compare commits
12 Commits
deploy-020
...
deploy-020
Author | SHA1 | Date | |
---|---|---|---|
|
24022c5adc | ||
|
1de9ecc0b6 | ||
|
9b80245ea0 | ||
|
4e1595c1a6 | ||
|
0be8585fa5 | ||
|
a0fe070fe7 | ||
|
abe9da0fc6 | ||
|
56d0128b0a | ||
|
840b68ac55 | ||
|
c34ff6d6c3 | ||
|
32780967d8 | ||
|
7330bc489d |
@@ -0,0 +1,5 @@
|
||||
-- Domains flagged by the NSFW block lists.
--   ID   : domain id (the table's primary key)
--   TIER : filter tier the domain was listed under
CREATE TABLE IF NOT EXISTS WMSA_prod.NSFW_DOMAINS (
    ID   INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
    TIER INT NOT NULL
);
|
@@ -37,6 +37,7 @@ dependencies {
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
implementation project(':code:functions:live-capture:api')
|
||||
implementation project(':code:functions:search-query')
|
||||
implementation project(':code:functions:nsfw-domain-filter')
|
||||
implementation project(':code:execution:api')
|
||||
|
||||
implementation project(':code:processes:crawling-process:model')
|
||||
|
@@ -6,6 +6,7 @@ import java.util.Set;
|
||||
|
||||
public enum ExecutorActor {
|
||||
PREC_EXPORT_ALL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
SYNC_NSFW_LISTS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
|
||||
CRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
RECRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
@@ -35,7 +36,8 @@ public enum ExecutorActor {
|
||||
LIVE_CRAWL(NodeProfile.REALTIME),
|
||||
PROC_LIVE_CRAWL_SPAWNER(NodeProfile.REALTIME),
|
||||
SCRAPE_FEEDS(NodeProfile.REALTIME),
|
||||
UPDATE_RSS(NodeProfile.REALTIME);
|
||||
UPDATE_RSS(NodeProfile.REALTIME)
|
||||
;
|
||||
|
||||
public String id() {
|
||||
return "fsm:" + name().toLowerCase();
|
||||
|
@@ -68,6 +68,7 @@ public class ExecutorActorControlService {
|
||||
ExecutorActorStateMachines stateMachines,
|
||||
MigrateCrawlDataActor migrateCrawlDataActor,
|
||||
ExportAllPrecessionActor exportAllPrecessionActor,
|
||||
UpdateNsfwFiltersActor updateNsfwFiltersActor,
|
||||
UpdateRssActor updateRssActor) throws SQLException {
|
||||
this.messageQueueFactory = messageQueueFactory;
|
||||
this.eventLog = baseServiceParams.eventLog;
|
||||
@@ -109,6 +110,7 @@ public class ExecutorActorControlService {
|
||||
register(ExecutorActor.UPDATE_RSS, updateRssActor);
|
||||
|
||||
register(ExecutorActor.MIGRATE_CRAWL_DATA, migrateCrawlDataActor);
|
||||
register(ExecutorActor.SYNC_NSFW_LISTS, updateNsfwFiltersActor);
|
||||
|
||||
if (serviceConfiguration.node() == 1) {
|
||||
register(ExecutorActor.PREC_EXPORT_ALL, exportAllPrecessionActor);
|
||||
|
@@ -0,0 +1,53 @@
|
||||
package nu.marginalia.actor.task;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.nsfw.NsfwDomainFilter;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@Singleton
|
||||
public class UpdateNsfwFiltersActor extends RecordActorPrototype {
|
||||
private final ServiceConfiguration serviceConfiguration;
|
||||
private final NsfwDomainFilter nsfwDomainFilter;
|
||||
|
||||
public record Initial() implements ActorStep {}
|
||||
public record Run() implements ActorStep {}
|
||||
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch(self) {
|
||||
case Initial() -> {
|
||||
if (serviceConfiguration.node() != 1) {
|
||||
yield new Error("This actor can only run on node 1");
|
||||
}
|
||||
else {
|
||||
yield new Run();
|
||||
}
|
||||
}
|
||||
case Run() -> {
|
||||
nsfwDomainFilter.fetchLists();
|
||||
yield new End();
|
||||
}
|
||||
default -> new Error();
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public String describe() {
|
||||
return "Sync NSFW filters";
|
||||
}
|
||||
|
||||
@Inject
|
||||
public UpdateNsfwFiltersActor(Gson gson,
|
||||
ServiceConfiguration serviceConfiguration,
|
||||
NsfwDomainFilter nsfwDomainFilter)
|
||||
{
|
||||
super(gson);
|
||||
this.serviceConfiguration = serviceConfiguration;
|
||||
this.nsfwDomainFilter = nsfwDomainFilter;
|
||||
}
|
||||
|
||||
}
|
@@ -12,6 +12,7 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
import java.time.Duration;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
@@ -35,14 +36,21 @@ public class DomSampleService {
|
||||
|
||||
if (StringUtils.isEmpty(browserlessAddress) || serviceConfiguration.node() > 1) {
|
||||
logger.warn("Live capture service will not run");
|
||||
browserlessURI = null; // satisfy final
|
||||
browserlessURI = null;
|
||||
}
|
||||
else {
|
||||
browserlessURI = new URI(browserlessAddress);
|
||||
}
|
||||
}
|
||||
|
||||
public void start() {
|
||||
if (browserlessURI == null) {
|
||||
logger.warn("DomSampleService is not enabled due to missing browserless URI or multi-node configuration");
|
||||
return;
|
||||
}
|
||||
|
||||
Thread.ofPlatform().daemon().start(this::run);
|
||||
}
|
||||
}
|
||||
|
||||
public void syncDomains() {
|
||||
Set<String> dbDomains = new HashSet<>();
|
||||
@@ -102,8 +110,7 @@ public class DomSampleService {
|
||||
private void updateDomain(BrowserlessClient client, String domain) {
|
||||
var rootUrl = "https://" + domain + "/";
|
||||
try {
|
||||
var content = client.annotatedContent(rootUrl,
|
||||
BrowserlessClient.GotoOptions.defaultValues());
|
||||
var content = client.annotatedContent(rootUrl, new BrowserlessClient.GotoOptions("load", Duration.ofSeconds(10).toMillis()));
|
||||
|
||||
if (content.isPresent()) {
|
||||
db.saveSample(domain, rootUrl, content.get());
|
||||
|
@@ -26,7 +26,9 @@ public class DomSampleDb implements AutoCloseable {
|
||||
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS samples (url TEXT PRIMARY KEY, domain TEXT, sample BLOB, requests BLOB, accepted_popover BOOLEAN DEFAULT FALSE)");
|
||||
stmt.executeUpdate("CREATE INDEX IF NOT EXISTS domain_index ON samples (domain)");
|
||||
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS schedule (domain TEXT PRIMARY KEY, last_fetch TIMESTAMP DEFAULT NULL)");
|
||||
stmt.execute("PRAGMA journal_mode=WAL");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public void syncDomains(Set<String> domains) {
|
||||
@@ -151,8 +153,6 @@ public class DomSampleDb implements AutoCloseable {
|
||||
|
||||
}
|
||||
|
||||
record Request(String url, String method, String timestamp, boolean acceptedPopover) {}
|
||||
|
||||
public void saveSampleRaw(String domain, String url, String sample, String requests, boolean acceptedPopover) throws SQLException {
|
||||
try (var stmt = connection.prepareStatement("""
|
||||
INSERT OR REPLACE
|
||||
|
@@ -141,7 +141,7 @@ public class BrowserlessClient implements AutoCloseable {
|
||||
|
||||
public record GotoOptions(String waitUntil, long timeout) {
|
||||
public static GotoOptions defaultValues() {
|
||||
return new GotoOptions("load", Duration.ofSeconds(10).toMillis());
|
||||
return new GotoOptions("networkidle2", Duration.ofSeconds(10).toMillis());
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -126,7 +126,6 @@ public class LiveCaptureGrpcService
|
||||
}
|
||||
else {
|
||||
EdgeDomain domain = domainNameOpt.get();
|
||||
String domainNameStr = domain.toString();
|
||||
|
||||
if (!isValidDomainForCapture(domain)) {
|
||||
ScreenshotDbOperations.flagDomainAsFetched(conn, domain);
|
||||
|
@@ -108,7 +108,7 @@ public class BrowserlessClientTest {
|
||||
DomSampleDb dbop = new DomSampleDb(Path.of("/tmp/dom-sample.db"))
|
||||
) {
|
||||
var content = client.annotatedContent("https://marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()).orElseThrow();
|
||||
dbop.saveSample("marginalia.nu", "https://www.thesodacanstove.com/alcohol-stove/how-to-build/", content);
|
||||
dbop.saveSample("marginalia.nu", "https://marginalia.nu/", content);
|
||||
System.out.println(content);
|
||||
Assertions.assertFalse(content.isBlank(), "Content should not be empty");
|
||||
|
||||
|
43
code/functions/nsfw-domain-filter/build.gradle
Normal file
43
code/functions/nsfw-domain-filter/build.gradle
Normal file
@@ -0,0 +1,43 @@
|
||||
// Build script for the nsfw-domain-filter function module.

plugins {
    id 'java'
    id 'jvm-test-suite'
}

java {
    toolchain {
        // The JVM version is read from the root project's ext properties
        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
    }
}

apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {
    // Project modules
    implementation project(':code:common:config')
    implementation project(':code:common:model')
    implementation project(':code:common:db')

    // Third-party libraries
    implementation libs.bundles.slf4j
    implementation libs.prometheus
    implementation libs.guava
    implementation libs.commons.lang3
    // Guice is pulled in without its transitive guava dependency
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
    implementation libs.notnull
    implementation libs.fastutil
    implementation libs.bundles.mariadb

    // Test libraries
    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
    testImplementation libs.commons.codec

    // Testcontainers (MariaDB-backed tests)
    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
    testImplementation 'org.testcontainers:mariadb:1.17.4'
    testImplementation 'org.testcontainers:junit-jupiter:1.17.4'

    // Project modules used by tests only
    testImplementation project(':code:common:service')
    testImplementation project(':code:libraries:test-helpers')
}
|
@@ -0,0 +1,192 @@
|
||||
package nu.marginalia.nsfw;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.google.inject.name.Named;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
@Singleton
|
||||
public class NsfwDomainFilter {
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
private final List<String> dangerLists;
|
||||
private final List<String> smutLists;
|
||||
|
||||
private volatile IntOpenHashSet blockedDomainIdsTier1 = new IntOpenHashSet();
|
||||
private volatile IntOpenHashSet blockedDomainIdsTier2 = new IntOpenHashSet();
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(NsfwDomainFilter.class);
|
||||
|
||||
public static final int NSFW_DISABLE = 0;
|
||||
public static final int NSFW_BLOCK_DANGER = 1;
|
||||
public static final int NSFW_BLOCK_SMUT = 2;
|
||||
|
||||
@Inject
|
||||
public NsfwDomainFilter(HikariDataSource dataSource,
|
||||
@Named("nsfw.dangerLists") List<String> dangerLists,
|
||||
@Named("nsfw.smutLists") List<String> smutLists
|
||||
) {
|
||||
this.dataSource = dataSource;
|
||||
|
||||
this.dangerLists = dangerLists;
|
||||
this.smutLists = smutLists;
|
||||
|
||||
Thread.ofPlatform().daemon().name("NsfwDomainFilterSync").start(() -> {
|
||||
while (true) {
|
||||
sync();
|
||||
try {
|
||||
TimeUnit.HOURS.sleep(1);
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
break; // Exit the loop if interrupted
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
public boolean isBlocked(int domainId, int tier) {
|
||||
if (tier == 0)
|
||||
return false;
|
||||
|
||||
if (tier >= 1 && blockedDomainIdsTier1.contains(domainId))
|
||||
return true;
|
||||
if (tier >= 2 && blockedDomainIdsTier2.contains(domainId))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private synchronized void sync() {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("SELECT ID, TIER FROM NSFW_DOMAINS")
|
||||
) {
|
||||
var rs = stmt.executeQuery();
|
||||
IntOpenHashSet tier1 = new IntOpenHashSet();
|
||||
IntOpenHashSet tier2 = new IntOpenHashSet();
|
||||
|
||||
while (rs.next()) {
|
||||
int domainId = rs.getInt("ID");
|
||||
int tier = rs.getInt("TIER");
|
||||
|
||||
switch (tier) {
|
||||
case 1 -> tier1.add(domainId);
|
||||
case 2 -> tier2.add(domainId);
|
||||
}
|
||||
}
|
||||
|
||||
this.blockedDomainIdsTier1 = tier1;
|
||||
this.blockedDomainIdsTier2 = tier2;
|
||||
|
||||
logger.info("NSFW domain filter synced: {} tier 1, {} tier 2", tier1.size(), tier2.size());
|
||||
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to sync NSFW domain filter", ex);
|
||||
}
|
||||
}
|
||||
|
||||
public synchronized void fetchLists() {
|
||||
try (var conn = dataSource.getConnection();
|
||||
HttpClient client = HttpClient.newBuilder()
|
||||
.followRedirects(HttpClient.Redirect.ALWAYS)
|
||||
.build();
|
||||
var stmt = conn.createStatement();
|
||||
var insertStmt = conn.prepareStatement("INSERT INTO NSFW_DOMAINS_TMP (ID, TIER) SELECT ID, ? FROM EC_DOMAIN WHERE DOMAIN_NAME = ?")) {
|
||||
|
||||
stmt.execute("DROP TABLE IF EXISTS NSFW_DOMAINS_TMP");
|
||||
stmt.execute("CREATE TABLE NSFW_DOMAINS_TMP LIKE NSFW_DOMAINS");
|
||||
|
||||
List<String> combinedDangerList = new ArrayList<>(10_000);
|
||||
for (var dangerListUrl : dangerLists) {
|
||||
combinedDangerList.addAll(fetchList(client, dangerListUrl));
|
||||
}
|
||||
|
||||
for (String domain : combinedDangerList) {
|
||||
insertStmt.setInt(1, NSFW_BLOCK_DANGER);
|
||||
insertStmt.setString(2, domain);
|
||||
insertStmt.execute();
|
||||
}
|
||||
|
||||
List<String> combinedSmutList = new ArrayList<>(10_000);
|
||||
for (var smutListUrl : smutLists) {
|
||||
combinedSmutList.addAll(fetchList(client, smutListUrl));
|
||||
}
|
||||
|
||||
for (String domain : combinedSmutList) {
|
||||
insertStmt.setInt(1, NSFW_BLOCK_SMUT);
|
||||
insertStmt.setString(2, domain);
|
||||
insertStmt.addBatch();
|
||||
insertStmt.execute();
|
||||
}
|
||||
|
||||
stmt.execute("""
|
||||
DROP TABLE IF EXISTS NSFW_DOMAINS
|
||||
""");
|
||||
stmt.execute("""
|
||||
RENAME TABLE NSFW_DOMAINS_TMP TO NSFW_DOMAINS
|
||||
""");
|
||||
sync();
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to fetch NSFW domain lists", ex);
|
||||
}
|
||||
}
|
||||
|
||||
public List<String> fetchList(HttpClient client, String url) {
|
||||
|
||||
logger.info("Fetching NSFW domain list from {}", url);
|
||||
|
||||
var request = HttpRequest.newBuilder()
|
||||
.uri(java.net.URI.create(url))
|
||||
.build();
|
||||
|
||||
try {
|
||||
if (url.endsWith(".gz")) {
|
||||
var response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
|
||||
|
||||
byte[] body = response.body();
|
||||
|
||||
try (var reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(new ByteArrayInputStream(body))))) {
|
||||
return reader.lines()
|
||||
.filter(StringUtils::isNotEmpty)
|
||||
.toList();
|
||||
} catch (Exception e) {
|
||||
logger.error("Error reading GZIP response from {}", url, e);
|
||||
}
|
||||
} else {
|
||||
var response = client.send(request, HttpResponse.BodyHandlers.ofString());
|
||||
if (response.statusCode() == 200) {
|
||||
|
||||
return Arrays.stream(StringUtils.split(response.body(), "\n"))
|
||||
.filter(StringUtils::isNotEmpty)
|
||||
.toList();
|
||||
} else {
|
||||
logger.warn("Failed to fetch list from {}: HTTP {}", url, response.statusCode());
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Error fetching NSFW domain list from {}", url, e);
|
||||
}
|
||||
|
||||
|
||||
return List.of();
|
||||
}
|
||||
}
|
@@ -0,0 +1,30 @@
|
||||
package nu.marginalia.nsfw;
|
||||
|
||||
import com.google.inject.AbstractModule;
|
||||
import com.google.inject.Provides;
|
||||
import jakarta.inject.Named;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class NsfwFilterModule extends AbstractModule {
|
||||
|
||||
@Provides
|
||||
@Named("nsfw.dangerLists")
|
||||
public List<String> nsfwDomainLists1() {
|
||||
return List.of(
|
||||
"https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/cryptojacking/domains",
|
||||
"https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/malware/domains",
|
||||
"https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/phishing/domains"
|
||||
);
|
||||
}
|
||||
@Provides
|
||||
@Named("nsfw.smutLists")
|
||||
public List<String> nsfwDomainLists2() {
|
||||
return List.of(
|
||||
"https://github.com/olbat/ut1-blacklists/raw/refs/heads/master/blacklists/adult/domains.gz",
|
||||
"https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/gambling/domains"
|
||||
);
|
||||
}
|
||||
|
||||
public void configure() {}
|
||||
}
|
@@ -0,0 +1,108 @@
|
||||
package nu.marginalia.nsfw;
|
||||
|
||||
|
||||
import com.google.inject.AbstractModule;
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Provides;
|
||||
import com.zaxxer.hikari.HikariConfig;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import jakarta.inject.Named;
|
||||
import nu.marginalia.test.TestMigrationLoader;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.testcontainers.containers.MariaDBContainer;
|
||||
import org.testcontainers.junit.jupiter.Container;
|
||||
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
|
||||
@Tag("slow")
|
||||
@Testcontainers
|
||||
class NsfwDomainFilterTest extends AbstractModule {
|
||||
|
||||
@Container
|
||||
static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
|
||||
.withDatabaseName("WMSA_prod")
|
||||
.withUsername("wmsa")
|
||||
.withPassword("wmsa")
|
||||
.withNetworkAliases("mariadb");
|
||||
|
||||
static HikariDataSource dataSource;
|
||||
static Path tempDir;
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpDb() throws IOException {
|
||||
tempDir = Files.createTempDirectory(NsfwDomainFilterTest.class.getSimpleName());
|
||||
|
||||
System.setProperty("system.homePath", tempDir.toString());
|
||||
|
||||
HikariConfig config = new HikariConfig();
|
||||
config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
|
||||
config.setUsername("wmsa");
|
||||
config.setPassword("wmsa");
|
||||
|
||||
dataSource = new HikariDataSource(config);
|
||||
|
||||
TestMigrationLoader.flywayMigration(dataSource);
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES (?, ?, 1)")
|
||||
) {
|
||||
|
||||
// Ensure the database is ready
|
||||
conn.createStatement().execute("SELECT 1");
|
||||
|
||||
stmt.setString(1, "www.google.com");
|
||||
stmt.setString(2, "google.com");
|
||||
stmt.executeUpdate();
|
||||
stmt.setString(1, "www.bing.com");
|
||||
stmt.setString(2, "bing.com");
|
||||
stmt.executeUpdate();
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Failed to connect to the database", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Provides
|
||||
@Named("nsfw.dangerLists")
|
||||
public List<String> nsfwDomainLists1() {
|
||||
return List.of(
|
||||
"https://downloads.marginalia.nu/test/list1"
|
||||
);
|
||||
}
|
||||
|
||||
@Provides
|
||||
@Named("nsfw.smutLists")
|
||||
public List<String> nsfwDomainLists2() {
|
||||
return List.of(
|
||||
"https://downloads.marginalia.nu/test/list2.gz"
|
||||
);
|
||||
}
|
||||
|
||||
public void configure() {
|
||||
bind(HikariDataSource.class).toInstance(dataSource);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test() {
|
||||
var filter = Guice
|
||||
.createInjector(this)
|
||||
.getInstance(NsfwDomainFilter.class);
|
||||
|
||||
filter.fetchLists();
|
||||
|
||||
assertTrue(filter.isBlocked(1, NsfwDomainFilter.NSFW_BLOCK_DANGER));
|
||||
assertTrue(filter.isBlocked(1, NsfwDomainFilter.NSFW_BLOCK_SMUT));
|
||||
assertFalse(filter.isBlocked(2, NsfwDomainFilter.NSFW_BLOCK_DANGER));
|
||||
assertTrue(filter.isBlocked(2, NsfwDomainFilter.NSFW_BLOCK_SMUT));
|
||||
}
|
||||
|
||||
}
|
@@ -1,9 +1,6 @@
|
||||
package nu.marginalia.api.searchquery;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryResponse;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||
import nu.marginalia.api.searchquery.model.query.*;
|
||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||
@@ -32,6 +29,8 @@ public class QueryProtobufCodec {
|
||||
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
|
||||
builder.setHumanQuery(request.getHumanQuery());
|
||||
|
||||
builder.setNsfwFilterTierValue(request.getNsfwFilterTierValue());
|
||||
|
||||
builder.setQuality(IndexProtobufCodec.convertSpecLimit(query.specs.quality));
|
||||
builder.setYear(IndexProtobufCodec.convertSpecLimit(query.specs.year));
|
||||
builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size));
|
||||
@@ -78,6 +77,8 @@ public class QueryProtobufCodec {
|
||||
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
|
||||
builder.setHumanQuery(humanQuery);
|
||||
|
||||
builder.setNsfwFilterTier(RpcIndexQuery.NSFW_FILTER_TIER.DANGER);
|
||||
|
||||
builder.setQuality(IndexProtobufCodec.convertSpecLimit(query.specs.quality));
|
||||
builder.setYear(IndexProtobufCodec.convertSpecLimit(query.specs.year));
|
||||
builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size));
|
||||
@@ -112,6 +113,7 @@ public class QueryProtobufCodec {
|
||||
request.getSearchSetIdentifier(),
|
||||
QueryStrategy.valueOf(request.getQueryStrategy()),
|
||||
RpcTemporalBias.Bias.valueOf(request.getTemporalBias().getBias().name()),
|
||||
NsfwFilterTier.fromCodedValue(request.getNsfwFilterTierValue()),
|
||||
request.getPagination().getPage()
|
||||
);
|
||||
}
|
||||
@@ -327,6 +329,7 @@ public class QueryProtobufCodec {
|
||||
.setRank(IndexProtobufCodec.convertSpecLimit(params.rank()))
|
||||
.setSearchSetIdentifier(params.identifier())
|
||||
.setQueryStrategy(params.queryStrategy().name())
|
||||
.setNsfwFilterTierValue(params.filterTier().getCodedValue())
|
||||
.setTemporalBias(RpcTemporalBias.newBuilder()
|
||||
.setBias(RpcTemporalBias.Bias.valueOf(params.temporalBias().name()))
|
||||
.build())
|
||||
|
@@ -0,0 +1,26 @@
|
||||
package nu.marginalia.api.searchquery.model.query;
|
||||
|
||||
/**
 * NSFW filter level of a query, each constant carrying the integer value it is
 * encoded as.
 */
public enum NsfwFilterTier {
    OFF(0),
    DANGER(1),
    PORN_AND_GAMBLING(2);

    private final int codedValue; // same as ordinal() for now, but can be changed later if needed

    NsfwFilterTier(int codedValue) {
        this.codedValue = codedValue;
    }

    /**
     * Looks up the tier for a coded value.
     *
     * @param codedValue the integer encoding of a tier
     * @return the tier whose coded value equals {@code codedValue}
     * @throws IllegalArgumentException if no tier has that coded value
     */
    public static NsfwFilterTier fromCodedValue(int codedValue) {
        for (NsfwFilterTier tier : NsfwFilterTier.values()) {
            if (tier.codedValue == codedValue) {
                return tier;
            }
        }
        throw new IllegalArgumentException("Invalid coded value for NsfwFilterTier: " + codedValue);
    }

    /** @return the integer this tier is encoded as */
    public int getCodedValue() {
        return codedValue;
    }
}
|
@@ -25,10 +25,11 @@ public record QueryParams(
|
||||
String identifier,
|
||||
QueryStrategy queryStrategy,
|
||||
RpcTemporalBias.Bias temporalBias,
|
||||
NsfwFilterTier filterTier,
|
||||
int page
|
||||
)
|
||||
{
|
||||
public QueryParams(String query, RpcQueryLimits limits, String identifier) {
|
||||
public QueryParams(String query, RpcQueryLimits limits, String identifier, NsfwFilterTier filterTier) {
|
||||
this(query, null,
|
||||
List.of(),
|
||||
List.of(),
|
||||
@@ -43,6 +44,7 @@ public record QueryParams(
|
||||
identifier,
|
||||
QueryStrategy.AUTO,
|
||||
RpcTemporalBias.Bias.NONE,
|
||||
filterTier,
|
||||
1 // page
|
||||
);
|
||||
}
|
||||
|
@@ -32,6 +32,14 @@ message RpcQsQuery {
|
||||
RpcTemporalBias temporalBias = 16;
|
||||
|
||||
RpcQsQueryPagination pagination = 17;
|
||||
|
||||
NSFW_FILTER_TIER nsfwFilterTier = 18;
|
||||
|
||||
enum NSFW_FILTER_TIER {
|
||||
NONE = 0;
|
||||
DANGER = 1;
|
||||
PORN_AND_GAMBLING = 2;
|
||||
};
|
||||
}
|
||||
|
||||
/* Query service query response */
|
||||
@@ -78,8 +86,17 @@ message RpcIndexQuery {
|
||||
RpcQueryLimits queryLimits = 10;
|
||||
string queryStrategy = 11; // Named query configuration
|
||||
RpcResultRankingParameters parameters = 12;
|
||||
|
||||
NSFW_FILTER_TIER nsfwFilterTier = 13;
|
||||
|
||||
enum NSFW_FILTER_TIER {
|
||||
NONE = 0;
|
||||
DANGER = 1;
|
||||
PORN_AND_GAMBLING = 2;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
/* A tagged union encoding some limit on a field */
|
||||
message RpcSpecLimit {
|
||||
int32 value = 1;
|
||||
|
@@ -19,6 +19,7 @@ dependencies {
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:service')
|
||||
|
||||
implementation project(':code:functions:nsfw-domain-filter')
|
||||
implementation project(':code:functions:search-query:api')
|
||||
|
||||
implementation project(':code:index:query')
|
||||
|
@@ -11,6 +11,7 @@ import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
|
||||
import nu.marginalia.index.api.IndexClient;
|
||||
import nu.marginalia.nsfw.NsfwDomainFilter;
|
||||
import nu.marginalia.service.server.DiscoverableService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@@ -34,13 +35,16 @@ public class QueryGRPCService
|
||||
|
||||
|
||||
private final QueryFactory queryFactory;
|
||||
private final NsfwDomainFilter nsfwDomainFilter;
|
||||
private final IndexClient indexClient;
|
||||
|
||||
@Inject
|
||||
public QueryGRPCService(QueryFactory queryFactory,
|
||||
NsfwDomainFilter nsfwDomainFilter,
|
||||
IndexClient indexClient)
|
||||
{
|
||||
this.queryFactory = queryFactory;
|
||||
this.nsfwDomainFilter = nsfwDomainFilter;
|
||||
this.indexClient = indexClient;
|
||||
}
|
||||
|
||||
|
@@ -3,6 +3,7 @@ package nu.marginalia.query.svc;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.RpcTemporalBias;
|
||||
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||
import nu.marginalia.functions.searchquery.QueryFactory;
|
||||
@@ -58,6 +59,7 @@ public class QueryFactoryTest {
|
||||
"NONE",
|
||||
QueryStrategy.AUTO,
|
||||
RpcTemporalBias.Bias.NONE,
|
||||
NsfwFilterTier.OFF,
|
||||
0), null).specs;
|
||||
}
|
||||
|
||||
|
@@ -17,6 +17,7 @@ dependencies {
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:common:db')
|
||||
implementation project(':code:libraries:message-queue')
|
||||
implementation project(':code:functions:nsfw-domain-filter')
|
||||
implementation project(':code:functions:search-query:api')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
|
@@ -2,11 +2,13 @@ package nu.marginalia.index.api;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import io.prometheus.client.Counter;
|
||||
import nu.marginalia.api.searchquery.IndexApiGrpc;
|
||||
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
|
||||
import nu.marginalia.api.searchquery.RpcIndexQuery;
|
||||
import nu.marginalia.db.DomainBlacklistImpl;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import nu.marginalia.nsfw.NsfwDomainFilter;
|
||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
@@ -28,14 +30,26 @@ public class IndexClient {
|
||||
private static final Logger logger = LoggerFactory.getLogger(IndexClient.class);
|
||||
private final GrpcMultiNodeChannelPool<IndexApiGrpc.IndexApiBlockingStub> channelPool;
|
||||
private final DomainBlacklistImpl blacklist;
|
||||
private final NsfwDomainFilter nsfwDomainFilter;
|
||||
|
||||
Counter wmsa_index_query_count = Counter.build()
|
||||
.name("wmsa_nsfw_filter_result_count")
|
||||
.labelNames("tier")
|
||||
.help("Count of results filtered by NSFW tier")
|
||||
.register();
|
||||
|
||||
private static final ExecutorService executor = Executors.newCachedThreadPool();
|
||||
|
||||
@Inject
|
||||
public IndexClient(GrpcChannelPoolFactory channelPoolFactory, DomainBlacklistImpl blacklist) {
|
||||
public IndexClient(GrpcChannelPoolFactory channelPoolFactory,
|
||||
DomainBlacklistImpl blacklist,
|
||||
NsfwDomainFilter nsfwDomainFilter
|
||||
) {
|
||||
this.channelPool = channelPoolFactory.createMulti(
|
||||
ServiceKey.forGrpcApi(IndexApiGrpc.class, ServicePartition.multi()),
|
||||
IndexApiGrpc::newBlockingStub);
|
||||
this.blacklist = blacklist;
|
||||
this.nsfwDomainFilter = nsfwDomainFilter;
|
||||
}
|
||||
|
||||
private static final Comparator<RpcDecoratedResultItem> comparator =
|
||||
@@ -52,7 +66,7 @@ public class IndexClient {
|
||||
public AggregateQueryResponse executeQueries(RpcIndexQuery indexRequest, Pagination pagination) {
|
||||
|
||||
final int requestedMaxResults = indexRequest.getQueryLimits().getResultsTotal();
|
||||
|
||||
int filterTier = indexRequest.getNsfwFilterTierValue();
|
||||
AtomicInteger totalNumResults = new AtomicInteger(0);
|
||||
|
||||
List<RpcDecoratedResultItem> results =
|
||||
@@ -74,7 +88,7 @@ public class IndexClient {
|
||||
}
|
||||
})
|
||||
.flatMap(List::stream)
|
||||
.filter(item -> !isBlacklisted(item))
|
||||
.filter(item -> !isBlacklisted(item, filterTier))
|
||||
.sorted(comparator)
|
||||
.skip(Math.max(0, (pagination.page - 1) * pagination.pageSize))
|
||||
.limit(pagination.pageSize)
|
||||
@@ -83,8 +97,23 @@ public class IndexClient {
|
||||
return new AggregateQueryResponse(results, pagination.page(), totalNumResults.get());
|
||||
}
|
||||
|
||||
private boolean isBlacklisted(RpcDecoratedResultItem item) {
|
||||
return blacklist.isBlacklisted(UrlIdCodec.getDomainId(item.getRawItem().getCombinedId()));
|
||||
static String[] tierNames = {
|
||||
"OFF",
|
||||
"DANGER",
|
||||
"NSFW"
|
||||
};
|
||||
|
||||
private boolean isBlacklisted(RpcDecoratedResultItem item, int filterTier) {
|
||||
int domainId = UrlIdCodec.getDomainId(item.getRawItem().getCombinedId());
|
||||
|
||||
if (blacklist.isBlacklisted(domainId)) {
|
||||
return true;
|
||||
}
|
||||
if (nsfwDomainFilter.isBlocked(domainId, filterTier)) {
|
||||
wmsa_index_query_count.labels(tierNames[filterTier]).inc();
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -7,6 +7,7 @@ import nu.marginalia.api.model.ApiSearchResultQueryDetails;
|
||||
import nu.marginalia.api.model.ApiSearchResults;
|
||||
import nu.marginalia.api.searchquery.QueryClient;
|
||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
|
||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||
@@ -29,9 +30,10 @@ public class ApiSearchOperator {
|
||||
|
||||
public ApiSearchResults query(String query,
|
||||
int count,
|
||||
int index)
|
||||
int index,
|
||||
NsfwFilterTier filterTier)
|
||||
{
|
||||
var rsp = queryClient.search(createParams(query, count, index));
|
||||
var rsp = queryClient.search(createParams(query, count, index, filterTier));
|
||||
|
||||
return new ApiSearchResults("RESTRICTED", query,
|
||||
rsp.results()
|
||||
@@ -42,7 +44,7 @@ public class ApiSearchOperator {
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
private QueryParams createParams(String query, int count, int index) {
|
||||
private QueryParams createParams(String query, int count, int index, NsfwFilterTier filterTirer) {
|
||||
SearchSetIdentifier searchSet = selectSearchSet(index);
|
||||
|
||||
return new QueryParams(
|
||||
@@ -53,7 +55,8 @@ public class ApiSearchOperator {
|
||||
.setTimeoutMs(150)
|
||||
.setFetchSize(8192)
|
||||
.build(),
|
||||
searchSet.name());
|
||||
searchSet.name(),
|
||||
filterTirer);
|
||||
}
|
||||
|
||||
private SearchSetIdentifier selectSearchSet(int index) {
|
||||
|
@@ -6,6 +6,7 @@ import io.prometheus.client.Counter;
|
||||
import io.prometheus.client.Histogram;
|
||||
import nu.marginalia.api.model.ApiLicense;
|
||||
import nu.marginalia.api.model.ApiSearchResults;
|
||||
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
|
||||
import nu.marginalia.api.svc.LicenseService;
|
||||
import nu.marginalia.api.svc.RateLimiterService;
|
||||
import nu.marginalia.api.svc.ResponseCache;
|
||||
@@ -119,6 +120,16 @@ public class ApiService extends SparkService {
|
||||
|
||||
int count = intParam(request, "count", 20);
|
||||
int index = intParam(request, "index", 3);
|
||||
int nsfw = intParam(request, "nsfw", 1);
|
||||
|
||||
NsfwFilterTier nsfwFilterTier;
|
||||
try {
|
||||
nsfwFilterTier = NsfwFilterTier.fromCodedValue(nsfw);
|
||||
}
|
||||
catch (IllegalArgumentException e) {
|
||||
Spark.halt(400, "Invalid nsfw parameter value");
|
||||
return null; // Unreachable, but required to satisfy the compiler
|
||||
}
|
||||
|
||||
logger.info(queryMarker, "{} Search {}", license.key, query);
|
||||
|
||||
@@ -126,7 +137,7 @@ public class ApiService extends SparkService {
|
||||
.labels(license.key)
|
||||
.time(() ->
|
||||
searchOperator
|
||||
.query(query, count, index)
|
||||
.query(query, count, index, nsfwFilterTier)
|
||||
.withLicense(license.getLicense())
|
||||
);
|
||||
}
|
||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.search;
|
||||
|
||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.RpcTemporalBias;
|
||||
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
|
||||
@@ -52,6 +53,7 @@ public class SearchQueryParamFactory {
|
||||
profile.searchSetIdentifier.name(),
|
||||
userParams.strategy(),
|
||||
userParams.temporalBias(),
|
||||
userParams.filterTier(),
|
||||
userParams.page()
|
||||
);
|
||||
|
||||
@@ -78,6 +80,7 @@ public class SearchQueryParamFactory {
|
||||
SearchSetIdentifier.NONE.name(),
|
||||
QueryStrategy.AUTO,
|
||||
RpcTemporalBias.Bias.NONE,
|
||||
NsfwFilterTier.OFF,
|
||||
1
|
||||
);
|
||||
}
|
||||
@@ -98,6 +101,7 @@ public class SearchQueryParamFactory {
|
||||
SearchSetIdentifier.NONE.name(),
|
||||
QueryStrategy.AUTO,
|
||||
RpcTemporalBias.Bias.NONE,
|
||||
NsfwFilterTier.DANGER,
|
||||
1
|
||||
);
|
||||
}
|
||||
@@ -118,6 +122,7 @@ public class SearchQueryParamFactory {
|
||||
SearchSetIdentifier.NONE.name(),
|
||||
QueryStrategy.AUTO,
|
||||
RpcTemporalBias.Bias.NONE,
|
||||
NsfwFilterTier.DANGER,
|
||||
1
|
||||
);
|
||||
}
|
||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.search.command;
|
||||
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.api.searchquery.RpcTemporalBias;
|
||||
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.search.model.SearchProfile;
|
||||
@@ -23,6 +24,10 @@ public record SearchParameters(String query,
|
||||
int page
|
||||
) {
|
||||
|
||||
/**
 * Returns the NSFW filter tier to apply to this search.
 *
 * The value is hard-coded: every instance reports {@link NsfwFilterTier#DANGER},
 * independent of the record's components.
 */
public NsfwFilterTier filterTier() {
|
||||
return NsfwFilterTier.DANGER;
|
||||
}
|
||||
|
||||
public SearchParameters(String queryString, Request request) {
|
||||
this(
|
||||
queryString,
|
||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.search;
|
||||
|
||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.RpcTemporalBias;
|
||||
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
|
||||
@@ -53,6 +54,7 @@ public class SearchQueryParamFactory {
|
||||
profile.searchSetIdentifier.name(),
|
||||
userParams.strategy(),
|
||||
userParams.temporalBias(),
|
||||
userParams.filterTier(),
|
||||
userParams.page()
|
||||
);
|
||||
|
||||
@@ -79,6 +81,7 @@ public class SearchQueryParamFactory {
|
||||
SearchSetIdentifier.NONE.name(),
|
||||
QueryStrategy.AUTO,
|
||||
RpcTemporalBias.Bias.NONE,
|
||||
NsfwFilterTier.OFF,
|
||||
page
|
||||
);
|
||||
}
|
||||
@@ -99,6 +102,7 @@ public class SearchQueryParamFactory {
|
||||
SearchSetIdentifier.NONE.name(),
|
||||
QueryStrategy.AUTO,
|
||||
RpcTemporalBias.Bias.NONE,
|
||||
NsfwFilterTier.DANGER,
|
||||
page
|
||||
);
|
||||
}
|
||||
@@ -119,6 +123,7 @@ public class SearchQueryParamFactory {
|
||||
SearchSetIdentifier.NONE.name(),
|
||||
QueryStrategy.AUTO,
|
||||
RpcTemporalBias.Bias.NONE,
|
||||
NsfwFilterTier.DANGER,
|
||||
1
|
||||
);
|
||||
}
|
||||
|
@@ -18,6 +18,7 @@ import nu.marginalia.service.server.JoobyService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
@@ -41,6 +42,8 @@ public class SearchService extends JoobyService {
|
||||
.help("Search service error count")
|
||||
.register();
|
||||
|
||||
private final String openSearchXML;
|
||||
|
||||
@Inject
|
||||
public SearchService(BaseServiceParams params,
|
||||
WebsiteUrl websiteUrl,
|
||||
@@ -69,6 +72,13 @@ public class SearchService extends JoobyService {
|
||||
this.siteSubscriptionService = siteSubscriptionService;
|
||||
this.faviconClient = faviconClient;
|
||||
this.domainQueries = domainQueries;
|
||||
|
||||
try (var is = ClassLoader.getSystemResourceAsStream("static/opensearch.xml")) {
|
||||
openSearchXML = new String(is.readAllBytes(), StandardCharsets.UTF_8);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException("Failed to load OpenSearch XML", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -82,6 +92,11 @@ public class SearchService extends JoobyService {
|
||||
jooby.get("/site/https://*", this::handleSiteUrlRedirect);
|
||||
jooby.get("/site/http://*", this::handleSiteUrlRedirect);
|
||||
|
||||
jooby.get("/opensearch.xml", ctx -> {
|
||||
ctx.setResponseType(MediaType.valueOf("application/opensearchdescription+xml"));
|
||||
return openSearchXML;
|
||||
});
|
||||
|
||||
String emptySvg = "<svg xmlns=\"http://www.w3.org/2000/svg\"></svg>";
|
||||
jooby.get("/site/{domain}/favicon", ctx -> {
|
||||
String domain = ctx.path("domain").value();
|
||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.search.command;
|
||||
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.api.searchquery.RpcTemporalBias;
|
||||
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
@@ -24,6 +25,10 @@ public record SearchParameters(WebsiteUrl url,
|
||||
int page
|
||||
) {
|
||||
|
||||
/**
 * Returns the NSFW filter tier to apply to this search.
 *
 * The value is hard-coded: every instance reports {@link NsfwFilterTier#DANGER},
 * independent of the record's components.
 */
public NsfwFilterTier filterTier() {
|
||||
return NsfwFilterTier.DANGER;
|
||||
}
|
||||
|
||||
public static SearchParameters defaultsForQuery(WebsiteUrl url, String query, int page) {
|
||||
return new SearchParameters(
|
||||
url,
|
||||
|
@@ -5,6 +5,7 @@ import com.google.inject.Inject;
|
||||
import io.jooby.Context;
|
||||
import io.jooby.Jooby;
|
||||
import nu.marginalia.assistant.suggest.Suggestions;
|
||||
import nu.marginalia.domsample.DomSampleService;
|
||||
import nu.marginalia.functions.domains.DomainInfoGrpcService;
|
||||
import nu.marginalia.functions.math.MathGrpcService;
|
||||
import nu.marginalia.livecapture.LiveCaptureGrpcService;
|
||||
@@ -30,6 +31,7 @@ public class AssistantService extends JoobyService {
|
||||
ScreenshotService screenshotService,
|
||||
DomainInfoGrpcService domainInfoGrpcService,
|
||||
LiveCaptureGrpcService liveCaptureGrpcService,
|
||||
DomSampleService domSampleService,
|
||||
FeedsGrpcService feedsGrpcService,
|
||||
MathGrpcService mathGrpcService,
|
||||
Suggestions suggestions)
|
||||
@@ -41,10 +43,11 @@ public class AssistantService extends JoobyService {
|
||||
liveCaptureGrpcService,
|
||||
feedsGrpcService),
|
||||
List.of());
|
||||
this.screenshotService = screenshotService;
|
||||
|
||||
this.screenshotService = screenshotService;
|
||||
this.suggestions = suggestions;
|
||||
|
||||
domSampleService.start();
|
||||
}
|
||||
|
||||
public void startJooby(Jooby jooby) {
|
||||
|
@@ -3,6 +3,7 @@ package nu.marginalia.control.app.svc;
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.api.searchquery.QueryClient;
|
||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.control.ControlRendererFactory;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
@@ -81,7 +82,8 @@ public class SearchToBanService {
|
||||
.setFetchSize(8192)
|
||||
.build()
|
||||
,
|
||||
"NONE"
|
||||
"NONE",
|
||||
NsfwFilterTier.OFF
|
||||
));
|
||||
}
|
||||
}
|
||||
|
@@ -44,6 +44,7 @@ dependencies {
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
implementation project(':code:functions:favicon')
|
||||
implementation project(':code:functions:favicon:api')
|
||||
implementation project(':code:functions:nsfw-domain-filter')
|
||||
|
||||
implementation project(':code:processes:crawling-process:model')
|
||||
implementation project(':code:processes:crawling-process:model')
|
||||
|
@@ -3,13 +3,14 @@ package nu.marginalia.executor;
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Injector;
|
||||
import nu.marginalia.nsfw.NsfwFilterModule;
|
||||
import nu.marginalia.service.MainClass;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.service.module.ServiceDiscoveryModule;
|
||||
import nu.marginalia.service.ServiceId;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.module.DatabaseModule;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.service.module.ServiceConfigurationModule;
|
||||
import nu.marginalia.service.module.ServiceDiscoveryModule;
|
||||
import nu.marginalia.service.server.Initialization;
|
||||
import nu.marginalia.service.server.NodeStatusWatcher;
|
||||
|
||||
@@ -27,6 +28,7 @@ public class ExecutorMain extends MainClass {
|
||||
Injector injector = Guice.createInjector(
|
||||
new ExecutorModule(),
|
||||
new DatabaseModule(false),
|
||||
new NsfwFilterModule(),
|
||||
new ServiceDiscoveryModule(),
|
||||
new ServiceConfigurationModule(ServiceId.Executor)
|
||||
);
|
||||
|
@@ -37,6 +37,7 @@ dependencies {
|
||||
implementation project(':code:functions:search-query:api')
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
implementation project(':code:functions:link-graph:aggregate')
|
||||
implementation project(':code:functions:nsfw-domain-filter')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
|
@@ -6,6 +6,7 @@ import com.google.inject.Inject;
|
||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.RpcTemporalBias;
|
||||
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
|
||||
import nu.marginalia.functions.searchquery.QueryGRPCService;
|
||||
@@ -61,7 +62,7 @@ public class QueryBasicInterface {
|
||||
.setTimeoutMs(250)
|
||||
.setFetchSize(8192)
|
||||
.build()
|
||||
, set);
|
||||
, set, NsfwFilterTier.OFF);
|
||||
|
||||
var pagination = new IndexClient.Pagination(page, count);
|
||||
|
||||
@@ -114,7 +115,7 @@ public class QueryBasicInterface {
|
||||
.setTimeoutMs(250)
|
||||
.setFetchSize(8192)
|
||||
.build(),
|
||||
set);
|
||||
set, NsfwFilterTier.OFF);
|
||||
|
||||
var pagination = new IndexClient.Pagination(page, count);
|
||||
|
||||
|
@@ -3,13 +3,14 @@ package nu.marginalia.query;
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Injector;
|
||||
import nu.marginalia.nsfw.NsfwFilterModule;
|
||||
import nu.marginalia.service.MainClass;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.service.module.ServiceDiscoveryModule;
|
||||
import nu.marginalia.service.ServiceId;
|
||||
import nu.marginalia.service.module.ServiceConfigurationModule;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.module.DatabaseModule;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.service.module.ServiceConfigurationModule;
|
||||
import nu.marginalia.service.module.ServiceDiscoveryModule;
|
||||
import nu.marginalia.service.server.Initialization;
|
||||
|
||||
public class QueryMain extends MainClass {
|
||||
@@ -26,6 +27,7 @@ public class QueryMain extends MainClass {
|
||||
Injector injector = Guice.createInjector(
|
||||
new QueryModule(),
|
||||
new DatabaseModule(false),
|
||||
new NsfwFilterModule(),
|
||||
new ServiceDiscoveryModule(),
|
||||
new ServiceConfigurationModule(ServiceId.Query)
|
||||
);
|
||||
|
@@ -10,3 +10,4 @@
|
||||
2025-05-08: Deploy assistant.
|
||||
2025-05-17: Redeploy all.
|
||||
2025-05-28: Deploy assistant and browserless.
|
||||
2025-06-06: Deploy assistant and browserless.
|
@@ -1,61 +0,0 @@
|
||||
# This docker-compose file is for the screenshot-capture-tool service.
|
||||
#
|
||||
# It is a standalone daemon that captures screenshots of web pages, based
|
||||
# on the domain database of Marginalia Search.
|
||||
#
|
||||
# It does not start the search engine itself.
|
||||
#
|
||||
|
||||
x-svc: &service
|
||||
env_file:
|
||||
- "run/env/service.env"
|
||||
volumes:
|
||||
- conf:/wmsa/conf:ro
|
||||
- data:/wmsa/data
|
||||
- logs:/var/log/wmsa
|
||||
networks:
|
||||
- wmsa
|
||||
services:
|
||||
screenshot-capture-tool:
|
||||
<<: *service
|
||||
image: "marginalia/screenshot-capture-tool"
|
||||
container_name: "screenshot-capture-tool"
|
||||
networks:
|
||||
- wmsa
|
||||
- headlesschrome
|
||||
depends_on:
|
||||
- browserless
|
||||
browserless:
|
||||
<<: *service
|
||||
image: "browserless/chrome"
|
||||
container_name: "headlesschrome"
|
||||
env_file:
|
||||
- "run/env/browserless.env"
|
||||
ports:
|
||||
- "3000:3000"
|
||||
networks:
|
||||
- wmsa
|
||||
- headlesschrome
|
||||
|
||||
networks:
|
||||
wmsa:
|
||||
headlesschrome:
|
||||
volumes:
|
||||
logs:
|
||||
driver: local
|
||||
driver_opts:
|
||||
type: none
|
||||
o: bind
|
||||
device: run/logs
|
||||
conf:
|
||||
driver: local
|
||||
driver_opts:
|
||||
type: none
|
||||
o: bind
|
||||
device: run/conf
|
||||
data:
|
||||
driver: local
|
||||
driver_opts:
|
||||
type: none
|
||||
o: bind
|
||||
device: run/data
|
@@ -20,6 +20,7 @@ include 'code:functions:favicon'
|
||||
include 'code:functions:favicon:api'
|
||||
include 'code:functions:domain-info'
|
||||
include 'code:functions:domain-info:api'
|
||||
include 'code:functions:nsfw-domain-filter'
|
||||
|
||||
include 'code:functions:link-graph:partition'
|
||||
include 'code:functions:link-graph:aggregate'
|
||||
@@ -153,9 +154,9 @@ dependencyResolutionManagement {
|
||||
library('guice', 'com.google.inject', 'guice').version('7.0.0')
|
||||
library('guava', 'com.google.guava', 'guava').version('32.0.1-jre')
|
||||
library('protobuf', 'com.google.protobuf', 'protobuf-java').version('3.16.3')
|
||||
library('grpc-protobuf', 'io.grpc', 'grpc-protobuf').version('1.49.2')
|
||||
library('grpc-stub', 'io.grpc', 'grpc-stub').version('1.49.2')
|
||||
library('grpc-netty', 'io.grpc', 'grpc-netty-shaded').version('1.49.2')
|
||||
library('grpc-protobuf', 'io.grpc', 'grpc-protobuf').version('1.73.0')
|
||||
library('grpc-stub', 'io.grpc', 'grpc-stub').version('1.73.0')
|
||||
library('grpc-netty', 'io.grpc', 'grpc-netty-shaded').version('1.73.0')
|
||||
|
||||
library('prometheus', 'io.prometheus', 'simpleclient').version('0.16.0')
|
||||
library('prometheus-servlet', 'io.prometheus', 'simpleclient_servlet').version('0.16.0')
|
||||
|
@@ -272,9 +272,9 @@ if __name__ == '__main__':
|
||||
deploy_tier=1,
|
||||
groups={"all", "core"}
|
||||
),
|
||||
'headlesschrome': ServiceConfig(
|
||||
'browserless': ServiceConfig(
|
||||
gradle_target=':code:tools:browserless:docker',
|
||||
docker_name='headlesschrome',
|
||||
docker_name='browserless',
|
||||
instances=None,
|
||||
deploy_tier=2,
|
||||
groups={"all", "core"}
|
||||
@@ -344,7 +344,7 @@ if __name__ == '__main__':
|
||||
|
||||
parser.add_argument('-v', '--verify', help='Verify the tags are valid, if present', action='store_true')
|
||||
parser.add_argument('-a', '--add', help='Add the tags provided as a new deployment tag, usually combined with -t', action='store_true')
|
||||
parser.add_argument('-t', '--tag', help='Use the specified tag value instead of the head git tag starting with deploy-')
|
||||
parser.add_argument('-t', '--tag', help='Use the specified tag value instead of the head git tag starting with deploy-; Expecting tags on the format "+service", "-service", or "group"')
|
||||
|
||||
args = parser.parse_args()
|
||||
tags = args.tag
|
||||
@@ -372,7 +372,7 @@ if __name__ == '__main__':
|
||||
|
||||
build_and_deploy(plan, SERVICE_CONFIG)
|
||||
else:
|
||||
print("No tags found")
|
||||
print("No tags found.")
|
||||
|
||||
except ValueError as e:
|
||||
print(f"Error: {e}")
|
||||
|
Reference in New Issue
Block a user