mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
59 Commits
deploy-023
...
deploy-028
Author | SHA1 | Date | |
---|---|---|---|
|
1b80e282a7 | ||
|
a65d18f1d1 | ||
|
6e214293e5 | ||
|
52582a6d7d | ||
|
ec0e39ad32 | ||
|
6a15aee4b0 | ||
|
bd5111e8a2 | ||
|
1ecbeb0272 | ||
|
390f053406 | ||
|
b03c43224c | ||
|
9b4ce9e9eb | ||
|
81ac02a695 | ||
|
47f624fb3b | ||
|
c866f19cbb | ||
|
518278493b | ||
|
1ac0bab0b8 | ||
|
08b45ed10a | ||
|
f2cfb91973 | ||
|
2f79524eb3 | ||
|
3b00142c96 | ||
|
294ab19177 | ||
|
6f1659ecb2 | ||
|
982dcb28f0 | ||
|
fc686d8b2e | ||
|
69ef0f334a | ||
|
446746f3bd | ||
|
24ab8398bb | ||
|
d2ceeff4cf | ||
|
cf64214b1c | ||
|
e50d09cc01 | ||
|
bce3892ce0 | ||
|
36581b25c2 | ||
|
52ff7fb4dd | ||
|
a4e49e658a | ||
|
e2c56dc3ca | ||
|
470b866008 | ||
|
4895a2ac7a | ||
|
fd32ae9fa7 | ||
|
470651ea4c | ||
|
8d4829e783 | ||
|
1290bc15dc | ||
|
e7fa558954 | ||
|
720685bf3f | ||
|
cbec63c7da | ||
|
b03ca75785 | ||
|
184aedc071 | ||
|
0275bad281 | ||
|
fd83a9d0b8 | ||
|
d556f8ae3a | ||
|
e37559837b | ||
|
3564c4aaee | ||
|
92c54563ab | ||
|
d7a5d90b07 | ||
|
0a0e88fd6e | ||
|
b4fc0c4368 | ||
|
87ee8765b8 | ||
|
1adf4835fa | ||
|
b7b5d0bf46 | ||
|
416059adde |
@@ -48,10 +48,6 @@ filter for any API consumer.
|
||||
|
||||
I've talked to the stract dev and he does not think it's a good idea to mimic their optics language, which is quite ad-hoc, but instead to work together to find some new common description language for this.
|
||||
|
||||
## Show favicons next to search results
|
||||
|
||||
This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.
|
||||
|
||||
## Specialized crawler for github
|
||||
|
||||
One of the search engine's biggest limitations right now is that it does not index github at all. A specialized crawler that fetches at least the readme.md would go a long way toward providing search capabilities in this domain.
|
||||
@@ -66,6 +62,10 @@ The documents database probably should have some sort of flag indicating it's a
|
||||
PDF parsing is known to be a bit of a security liability so some thought needs to be put in
|
||||
that direction as well.
|
||||
|
||||
## Show favicons next to search results (COMPLETED 2025-03)
|
||||
|
||||
This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.
|
||||
|
||||
## Web Design Overhaul (COMPLETED 2025-01)
|
||||
|
||||
The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
|
||||
|
@@ -45,7 +45,7 @@ public class NodeConfigurationService {
|
||||
public List<NodeConfiguration> getAll() {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var qs = conn.prepareStatement("""
|
||||
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, NODE_PROFILE, DISABLED
|
||||
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, AUTO_ASSIGN_DOMAINS, KEEP_WARCS, NODE_PROFILE, DISABLED
|
||||
FROM NODE_CONFIGURATION
|
||||
""")) {
|
||||
var rs = qs.executeQuery();
|
||||
@@ -59,6 +59,7 @@ public class NodeConfigurationService {
|
||||
rs.getBoolean("ACCEPT_QUERIES"),
|
||||
rs.getBoolean("AUTO_CLEAN"),
|
||||
rs.getBoolean("PRECESSION"),
|
||||
rs.getBoolean("AUTO_ASSIGN_DOMAINS"),
|
||||
rs.getBoolean("KEEP_WARCS"),
|
||||
NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
|
||||
rs.getBoolean("DISABLED")
|
||||
@@ -75,7 +76,7 @@ public class NodeConfigurationService {
|
||||
public NodeConfiguration get(int nodeId) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var qs = conn.prepareStatement("""
|
||||
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, NODE_PROFILE, DISABLED
|
||||
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, AUTO_ASSIGN_DOMAINS, KEEP_WARCS, NODE_PROFILE, DISABLED
|
||||
FROM NODE_CONFIGURATION
|
||||
WHERE ID=?
|
||||
""")) {
|
||||
@@ -88,6 +89,7 @@ public class NodeConfigurationService {
|
||||
rs.getBoolean("ACCEPT_QUERIES"),
|
||||
rs.getBoolean("AUTO_CLEAN"),
|
||||
rs.getBoolean("PRECESSION"),
|
||||
rs.getBoolean("AUTO_ASSIGN_DOMAINS"),
|
||||
rs.getBoolean("KEEP_WARCS"),
|
||||
NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
|
||||
rs.getBoolean("DISABLED")
|
||||
@@ -102,7 +104,7 @@ public class NodeConfigurationService {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var us = conn.prepareStatement("""
|
||||
UPDATE NODE_CONFIGURATION
|
||||
SET DESCRIPTION=?, ACCEPT_QUERIES=?, AUTO_CLEAN=?, PRECESSION=?, KEEP_WARCS=?, DISABLED=?, NODE_PROFILE=?
|
||||
SET DESCRIPTION=?, ACCEPT_QUERIES=?, AUTO_CLEAN=?, PRECESSION=?, AUTO_ASSIGN_DOMAINS=?, KEEP_WARCS=?, DISABLED=?, NODE_PROFILE=?
|
||||
WHERE ID=?
|
||||
"""))
|
||||
{
|
||||
@@ -110,10 +112,11 @@ public class NodeConfigurationService {
|
||||
us.setBoolean(2, config.acceptQueries());
|
||||
us.setBoolean(3, config.autoClean());
|
||||
us.setBoolean(4, config.includeInPrecession());
|
||||
us.setBoolean(5, config.keepWarcs());
|
||||
us.setBoolean(6, config.disabled());
|
||||
us.setString(7, config.profile().name());
|
||||
us.setInt(8, config.node());
|
||||
us.setBoolean(5, config.autoAssignDomains());
|
||||
us.setBoolean(6, config.keepWarcs());
|
||||
us.setBoolean(7, config.disabled());
|
||||
us.setString(8, config.profile().name());
|
||||
us.setInt(9, config.node());
|
||||
|
||||
if (us.executeUpdate() <= 0)
|
||||
throw new IllegalStateException("Failed to update configuration");
|
||||
|
@@ -5,6 +5,7 @@ public record NodeConfiguration(int node,
|
||||
boolean acceptQueries,
|
||||
boolean autoClean,
|
||||
boolean includeInPrecession,
|
||||
boolean autoAssignDomains,
|
||||
boolean keepWarcs,
|
||||
NodeProfile profile,
|
||||
boolean disabled
|
||||
|
@@ -20,9 +20,7 @@ public enum NodeProfile {
|
||||
}
|
||||
|
||||
public boolean permitBatchCrawl() {
|
||||
return isBatchCrawl() ||isMixed();
|
||||
}
|
||||
public boolean permitSideload() {
|
||||
return isMixed() || isSideload();
|
||||
return isBatchCrawl() || isMixed();
|
||||
}
|
||||
public boolean permitSideload() { return isSideload() || isMixed(); }
|
||||
}
|
||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.nodecfg;
|
||||
|
||||
import com.zaxxer.hikari.HikariConfig;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.nodecfg.model.NodeConfiguration;
|
||||
import nu.marginalia.nodecfg.model.NodeProfile;
|
||||
import nu.marginalia.test.TestMigrationLoader;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
@@ -62,6 +63,63 @@ public class NodeConfigurationServiceTest {
|
||||
assertEquals(2, list.size());
|
||||
assertEquals(a, list.get(0));
|
||||
assertEquals(b, list.get(1));
|
||||
}
|
||||
|
||||
|
||||
// Test all the fields that are only exposed via save()
|
||||
@Test
|
||||
public void testSaveChanges() throws SQLException {
|
||||
var original = nodeConfigurationService.create(1, "Test", false, false, NodeProfile.MIXED);
|
||||
|
||||
assertEquals(1, original.node());
|
||||
assertEquals("Test", original.description());
|
||||
assertFalse(original.acceptQueries());
|
||||
|
||||
var precession = new NodeConfiguration(
|
||||
original.node(),
|
||||
"Foo",
|
||||
true,
|
||||
original.autoClean(),
|
||||
original.includeInPrecession(),
|
||||
!original.autoAssignDomains(),
|
||||
original.keepWarcs(),
|
||||
original.profile(),
|
||||
original.disabled()
|
||||
);
|
||||
|
||||
nodeConfigurationService.save(precession);
|
||||
precession = nodeConfigurationService.get(original.node());
|
||||
assertNotEquals(original.autoAssignDomains(), precession.autoAssignDomains());
|
||||
|
||||
var autoClean = new NodeConfiguration(
|
||||
original.node(),
|
||||
"Foo",
|
||||
true,
|
||||
!original.autoClean(),
|
||||
original.includeInPrecession(),
|
||||
original.autoAssignDomains(),
|
||||
original.keepWarcs(),
|
||||
original.profile(),
|
||||
original.disabled()
|
||||
);
|
||||
|
||||
nodeConfigurationService.save(autoClean);
|
||||
autoClean = nodeConfigurationService.get(original.node());
|
||||
assertNotEquals(original.autoClean(), autoClean.autoClean());
|
||||
|
||||
var disabled = new NodeConfiguration(
|
||||
original.node(),
|
||||
"Foo",
|
||||
true,
|
||||
autoClean.autoClean(),
|
||||
autoClean.includeInPrecession(),
|
||||
autoClean.autoAssignDomains(),
|
||||
autoClean.keepWarcs(),
|
||||
autoClean.profile(),
|
||||
!autoClean.disabled()
|
||||
);
|
||||
nodeConfigurationService.save(disabled);
|
||||
disabled = nodeConfigurationService.get(original.node());
|
||||
assertNotEquals(autoClean.disabled(), disabled.disabled());
|
||||
}
|
||||
}
|
@@ -0,0 +1,7 @@
|
||||
-- Add additional summary columns to DOMAIN_SECURITY_INFORMATION table
|
||||
-- to make it easier to get more information about the SSL certificate's validity
|
||||
|
||||
ALTER TABLE DOMAIN_SECURITY_INFORMATION ADD COLUMN SSL_CHAIN_VALID BOOLEAN DEFAULT NULL;
|
||||
ALTER TABLE DOMAIN_SECURITY_INFORMATION ADD COLUMN SSL_HOST_VALID BOOLEAN DEFAULT NULL;
|
||||
ALTER TABLE DOMAIN_SECURITY_INFORMATION ADD COLUMN SSL_DATE_VALID BOOLEAN DEFAULT NULL;
|
||||
OPTIMIZE TABLE DOMAIN_SECURITY_INFORMATION;
|
@@ -0,0 +1,5 @@
|
||||
-- Add additional summary columns to DOMAIN_SECURITY_EVENTS table
|
||||
-- to make it easier to make sense of certificate changes
|
||||
|
||||
ALTER TABLE DOMAIN_SECURITY_EVENTS ADD COLUMN CHANGE_SCHEMA ENUM('NONE', 'HTTP_TO_HTTPS', 'HTTPS_TO_HTTP', 'UNKNOWN') NOT NULL DEFAULT 'UNKNOWN';
|
||||
OPTIMIZE TABLE DOMAIN_SECURITY_EVENTS;
|
@@ -0,0 +1,12 @@
|
||||
-- Table holding domains to be processed by the NDP in order to figure out whether to add them to
|
||||
-- be crawled.
|
||||
|
||||
CREATE TABLE IF NOT EXISTS NDP_NEW_DOMAINS(
|
||||
DOMAIN_ID INT NOT NULL PRIMARY KEY,
|
||||
STATE ENUM ('NEW', 'ACCEPTED', 'REJECTED') NOT NULL DEFAULT 'NEW',
|
||||
PRIORITY INT NOT NULL DEFAULT 0,
|
||||
TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
|
||||
CHECK_COUNT INT NOT NULL DEFAULT 0
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS NDP_NEW_DOMAINS__STATE_PRIORITY ON NDP_NEW_DOMAINS (STATE, PRIORITY DESC);
|
@@ -0,0 +1,3 @@
|
||||
-- Migration script to add AUTO_ASSIGN_DOMAINS column to NODE_CONFIGURATION table
|
||||
|
||||
ALTER TABLE NODE_CONFIGURATION ADD COLUMN AUTO_ASSIGN_DOMAINS BOOLEAN NOT NULL DEFAULT TRUE;
|
@@ -13,6 +13,7 @@ import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.util.NamedExecutorFactory;
|
||||
|
||||
import java.util.concurrent.Executor;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.function.Function;
|
||||
|
||||
@Singleton
|
||||
@@ -20,10 +21,15 @@ public class GrpcChannelPoolFactory {
|
||||
|
||||
private final NodeConfigurationWatcher nodeConfigurationWatcher;
|
||||
private final ServiceRegistryIf serviceRegistryIf;
|
||||
private static final Executor executor = NamedExecutorFactory.createFixed("gRPC-Channel-Pool",
|
||||
Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
|
||||
private static final Executor offloadExecutor = NamedExecutorFactory.createFixed("gRPC-Offload-Pool",
|
||||
Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
|
||||
|
||||
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
|
||||
|
||||
private static final Executor executor = useLoom
|
||||
? Executors.newVirtualThreadPerTaskExecutor()
|
||||
: NamedExecutorFactory.createFixed("gRPC-Channel-Pool", Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
|
||||
private static final Executor offloadExecutor = useLoom
|
||||
? Executors.newVirtualThreadPerTaskExecutor()
|
||||
: NamedExecutorFactory.createFixed("gRPC-Offload-Pool", Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
|
||||
|
||||
@Inject
|
||||
public GrpcChannelPoolFactory(NodeConfigurationWatcher nodeConfigurationWatcher,
|
||||
|
@@ -13,9 +13,14 @@ import nu.marginalia.util.NamedExecutorFactory;
|
||||
import java.io.IOException;
|
||||
import java.net.InetSocketAddress;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
|
||||
public class GrpcServer {
|
||||
private final Server server;
|
||||
|
||||
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
|
||||
|
||||
public GrpcServer(ServiceConfiguration config,
|
||||
ServiceRegistryIf serviceRegistry,
|
||||
ServicePartition partition,
|
||||
@@ -26,8 +31,13 @@ public class GrpcServer {
|
||||
int nThreads = Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 16);
|
||||
|
||||
// Start the gRPC server
|
||||
|
||||
ExecutorService workExecutor = useLoom ?
|
||||
Executors.newVirtualThreadPerTaskExecutor() :
|
||||
NamedExecutorFactory.createFixed("nettyExecutor", nThreads);
|
||||
|
||||
var grpcServerBuilder = NettyServerBuilder.forAddress(new InetSocketAddress(config.bindAddress(), port))
|
||||
.executor(NamedExecutorFactory.createFixed("nettyExecutor", nThreads))
|
||||
.executor(workExecutor)
|
||||
.workerEventLoopGroup(new NioEventLoopGroup(nThreads, NamedExecutorFactory.createFixed("Worker-ELG", nThreads)))
|
||||
.bossEventLoopGroup(new NioEventLoopGroup(nThreads, NamedExecutorFactory.createFixed("Boss-ELG", nThreads)))
|
||||
.channelType(NioServerSocketChannel.class);
|
||||
|
@@ -125,8 +125,7 @@ public class JoobyService {
|
||||
// Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
|
||||
// multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
|
||||
// scenario
|
||||
options.setWorkerThreads(Math.min(128, options.getWorkerThreads()));
|
||||
|
||||
options.setWorkerThreads(Math.min(16, options.getWorkerThreads()));
|
||||
|
||||
jooby.setServerOptions(options);
|
||||
|
||||
|
@@ -7,6 +7,7 @@
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ProcessConsole" target="SYSTEM_OUT">
|
||||
@@ -23,6 +24,7 @@
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
<SizeBasedTriggeringPolicy size="10MB" />
|
||||
</RollingFile>
|
||||
@@ -36,6 +38,16 @@
|
||||
<MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</RollingFile>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||
</PatternLayout>
|
||||
<SizeBasedTriggeringPolicy size="100MB" />
|
||||
<Filters>
|
||||
<MarkerFilter marker="CONVERTER" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</RollingFile>
|
||||
</Appenders>
|
||||
<Loggers>
|
||||
<Logger name="org.apache.zookeeper" level="WARN" />
|
||||
|
@@ -8,6 +8,7 @@
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleWarn" target="SYSTEM_OUT">
|
||||
@@ -18,6 +19,7 @@
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleError" target="SYSTEM_OUT">
|
||||
@@ -28,6 +30,7 @@
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleFatal" target="SYSTEM_OUT">
|
||||
@@ -38,6 +41,7 @@
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ProcessConsole" target="SYSTEM_OUT">
|
||||
@@ -57,6 +61,7 @@
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</RollingFile>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
@@ -69,6 +74,16 @@
|
||||
<MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</RollingFile>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||
</PatternLayout>
|
||||
<SizeBasedTriggeringPolicy size="100MB" />
|
||||
<Filters>
|
||||
<MarkerFilter marker="CONVERTER" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</RollingFile>
|
||||
</Appenders>
|
||||
<Loggers>
|
||||
<Logger name="org.apache.zookeeper" level="WARN" />
|
||||
|
@@ -9,6 +9,7 @@ import nu.marginalia.executor.storage.FileStorageFile;
|
||||
import nu.marginalia.executor.upload.UploadDirContents;
|
||||
import nu.marginalia.executor.upload.UploadDirItem;
|
||||
import nu.marginalia.functions.execution.api.*;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.service.ServiceId;
|
||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
|
||||
@@ -25,27 +26,37 @@ import java.net.URISyntaxException;
|
||||
import java.net.URL;
|
||||
import java.net.URLEncoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.Duration;
|
||||
import java.util.List;
|
||||
|
||||
import static nu.marginalia.functions.execution.api.ExecutorApiGrpc.ExecutorApiBlockingStub;
|
||||
|
||||
@Singleton
|
||||
public class ExecutorClient {
|
||||
private final MqPersistence persistence;
|
||||
private final GrpcMultiNodeChannelPool<ExecutorApiBlockingStub> channelPool;
|
||||
private static final Logger logger = LoggerFactory.getLogger(ExecutorClient.class);
|
||||
private final ServiceRegistryIf registry;
|
||||
|
||||
@Inject
|
||||
public ExecutorClient(ServiceRegistryIf registry,
|
||||
MqPersistence persistence,
|
||||
GrpcChannelPoolFactory grpcChannelPoolFactory)
|
||||
{
|
||||
this.registry = registry;
|
||||
this.persistence = persistence;
|
||||
this.channelPool = grpcChannelPoolFactory
|
||||
.createMulti(
|
||||
ServiceKey.forGrpcApi(ExecutorApiGrpc.class, ServicePartition.multi()),
|
||||
ExecutorApiGrpc::newBlockingStub);
|
||||
}
|
||||
|
||||
private long createTrackingTokenMsg(String task, int node, Duration ttl) throws Exception {
|
||||
return persistence.sendNewMessage("task-tracking[" + node + "]", "export-client", null, task, "", ttl);
|
||||
}
|
||||
|
||||
|
||||
|
||||
public void startFsm(int node, String actorName) {
|
||||
channelPool.call(ExecutorApiBlockingStub::startFsm)
|
||||
.forNode(node)
|
||||
@@ -96,6 +107,16 @@ public class ExecutorClient {
|
||||
.build());
|
||||
}
|
||||
|
||||
public long updateNsfwFilters() throws Exception {
|
||||
long msgId = createTrackingTokenMsg("nsfw-filters", 1, Duration.ofHours(6));
|
||||
|
||||
channelPool.call(ExecutorApiBlockingStub::updateNsfwFilters)
|
||||
.forNode(1)
|
||||
.run(RpcUpdateNsfwFilters.newBuilder().setMsgId(msgId).build());
|
||||
|
||||
return msgId;
|
||||
}
|
||||
|
||||
public ActorRunStates getActorStates(int node) {
|
||||
try {
|
||||
var rs = channelPool.call(ExecutorApiBlockingStub::getActorStates)
|
||||
|
@@ -18,6 +18,8 @@ service ExecutorApi {
|
||||
rpc calculateAdjacencies(Empty) returns (Empty) {}
|
||||
rpc restoreBackup(RpcFileStorageId) returns (Empty) {}
|
||||
|
||||
rpc updateNsfwFilters(RpcUpdateNsfwFilters) returns (Empty) {}
|
||||
|
||||
rpc restartExecutorService(Empty) returns (Empty) {}
|
||||
}
|
||||
|
||||
@@ -66,6 +68,9 @@ message RpcExportRequest {
|
||||
int64 fileStorageId = 1;
|
||||
int64 msgId = 2;
|
||||
}
|
||||
message RpcUpdateNsfwFilters {
|
||||
int64 msgId = 1;
|
||||
}
|
||||
message RpcFileStorageIdWithDomainName {
|
||||
int64 fileStorageId = 1;
|
||||
string targetDomainName = 2;
|
||||
|
@@ -20,6 +20,7 @@ dependencies {
|
||||
implementation project(':code:processes:live-crawling-process')
|
||||
implementation project(':code:processes:loading-process')
|
||||
implementation project(':code:processes:ping-process')
|
||||
implementation project(':code:processes:new-domain-process')
|
||||
implementation project(':code:processes:converting-process')
|
||||
implementation project(':code:processes:index-constructor-process')
|
||||
|
||||
@@ -41,7 +42,6 @@ dependencies {
|
||||
implementation project(':code:functions:nsfw-domain-filter')
|
||||
implementation project(':code:execution:api')
|
||||
|
||||
implementation project(':code:processes:crawling-process:model')
|
||||
implementation project(':code:processes:crawling-process:model')
|
||||
implementation project(':code:processes:crawling-process:ft-link-parser')
|
||||
implementation project(':code:index:index-journal')
|
||||
|
@@ -2,10 +2,11 @@ package nu.marginalia.actor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.functions.execution.api.*;
|
||||
import nu.marginalia.functions.execution.api.RpcFsmName;
|
||||
import nu.marginalia.functions.execution.api.RpcProcessId;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@@ -14,18 +15,18 @@ import spark.Spark;
|
||||
@Singleton
|
||||
public class ActorApi {
|
||||
private final ExecutorActorControlService actors;
|
||||
private final ProcessService processService;
|
||||
private final ProcessSpawnerService processSpawnerService;
|
||||
private final MqPersistence mqPersistence;
|
||||
private final ServiceConfiguration serviceConfiguration;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
@Inject
|
||||
public ActorApi(ExecutorActorControlService actors,
|
||||
ProcessService processService,
|
||||
ProcessSpawnerService processSpawnerService,
|
||||
MqPersistence mqPersistence,
|
||||
ServiceConfiguration serviceConfiguration)
|
||||
{
|
||||
this.actors = actors;
|
||||
this.processService = processService;
|
||||
this.processSpawnerService = processSpawnerService;
|
||||
this.mqPersistence = mqPersistence;
|
||||
this.serviceConfiguration = serviceConfiguration;
|
||||
}
|
||||
@@ -43,7 +44,7 @@ public class ActorApi {
|
||||
}
|
||||
|
||||
public Object stopProcess(RpcProcessId processId) {
|
||||
ProcessService.ProcessId id = ProcessService.translateExternalIdBase(processId.getProcessId());
|
||||
ProcessSpawnerService.ProcessId id = ProcessSpawnerService.translateExternalIdBase(processId.getProcessId());
|
||||
|
||||
try {
|
||||
String inbox = id.name().toLowerCase() + ":" + serviceConfiguration.node();
|
||||
@@ -60,7 +61,7 @@ public class ActorApi {
|
||||
}
|
||||
|
||||
}
|
||||
processService.kill(id);
|
||||
processSpawnerService.kill(id);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Failed to stop process {}", id, ex);
|
||||
|
@@ -6,7 +6,7 @@ import java.util.Set;
|
||||
|
||||
public enum ExecutorActor {
|
||||
PREC_EXPORT_ALL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
SYNC_NSFW_LISTS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
UPDATE_NSFW_LISTS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD, NodeProfile.REALTIME),
|
||||
|
||||
CRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
RECRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
@@ -14,6 +14,7 @@ public enum ExecutorActor {
|
||||
PROC_CRAWLER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
PROC_PING_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.REALTIME),
|
||||
PROC_EXPORT_TASKS_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
PROC_NDP_SPAWNER(NodeProfile.MIXED, NodeProfile.REALTIME),
|
||||
ADJACENCY_CALCULATION(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
EXPORT_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
EXPORT_SEGMENTATION_MODEL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
|
@@ -49,6 +49,7 @@ public class ExecutorActorControlService {
|
||||
RecrawlSingleDomainActor recrawlSingleDomainActor,
|
||||
RestoreBackupActor restoreBackupActor,
|
||||
ConverterMonitorActor converterMonitorFSM,
|
||||
NdpMonitorActor ndpMonitorActor,
|
||||
PingMonitorActor pingMonitorActor,
|
||||
CrawlerMonitorActor crawlerMonitorActor,
|
||||
LiveCrawlerMonitorActor liveCrawlerMonitorActor,
|
||||
@@ -93,7 +94,7 @@ public class ExecutorActorControlService {
|
||||
register(ExecutorActor.PROC_PING_SPAWNER, pingMonitorActor);
|
||||
register(ExecutorActor.PROC_LIVE_CRAWL_SPAWNER, liveCrawlerMonitorActor);
|
||||
register(ExecutorActor.PROC_EXPORT_TASKS_SPAWNER, exportTasksMonitorActor);
|
||||
|
||||
register(ExecutorActor.PROC_NDP_SPAWNER, ndpMonitorActor);
|
||||
register(ExecutorActor.MONITOR_PROCESS_LIVENESS, processMonitorFSM);
|
||||
register(ExecutorActor.MONITOR_FILE_STORAGE, fileStorageMonitorActor);
|
||||
|
||||
@@ -112,7 +113,7 @@ public class ExecutorActorControlService {
|
||||
register(ExecutorActor.UPDATE_RSS, updateRssActor);
|
||||
|
||||
register(ExecutorActor.MIGRATE_CRAWL_DATA, migrateCrawlDataActor);
|
||||
register(ExecutorActor.SYNC_NSFW_LISTS, updateNsfwFiltersActor);
|
||||
register(ExecutorActor.UPDATE_NSFW_LISTS, updateNsfwFiltersActor);
|
||||
|
||||
if (serviceConfiguration.node() == 1) {
|
||||
register(ExecutorActor.PREC_EXPORT_ALL, exportAllPrecessionActor);
|
||||
|
@@ -4,11 +4,14 @@ import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.*;
|
||||
import nu.marginalia.mq.persistence.MqMessageHandlerRegistry;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.actor.state.Resume;
|
||||
import nu.marginalia.actor.state.Terminal;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.persistence.MqMessageHandlerRegistry;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@@ -24,13 +27,13 @@ import java.util.concurrent.atomic.AtomicBoolean;
|
||||
public class AbstractProcessSpawnerActor extends RecordActorPrototype {
|
||||
|
||||
private final MqPersistence persistence;
|
||||
private final ProcessService processService;
|
||||
private final ProcessSpawnerService processSpawnerService;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
public static final int MAX_ATTEMPTS = 3;
|
||||
private final String inboxName;
|
||||
private final ProcessService.ProcessId processId;
|
||||
private final ProcessSpawnerService.ProcessId processId;
|
||||
private final ExecutorService executorService = Executors.newSingleThreadExecutor();
|
||||
private final int node;
|
||||
|
||||
@@ -50,7 +53,7 @@ public class AbstractProcessSpawnerActor extends RecordActorPrototype {
|
||||
for (;;) {
|
||||
var messages = persistence.eavesdrop(inboxName, 1);
|
||||
|
||||
if (messages.isEmpty() && !processService.isRunning(processId)) {
|
||||
if (messages.isEmpty() && !processSpawnerService.isRunning(processId)) {
|
||||
synchronized (processId) {
|
||||
processId.wait(5000);
|
||||
}
|
||||
@@ -92,7 +95,7 @@ public class AbstractProcessSpawnerActor extends RecordActorPrototype {
|
||||
catch (InterruptedException ex) {
|
||||
// We get this exception when the process is cancelled by the user
|
||||
|
||||
processService.kill(processId);
|
||||
processSpawnerService.kill(processId);
|
||||
setCurrentMessageToDead();
|
||||
|
||||
yield new Aborted();
|
||||
@@ -112,13 +115,13 @@ public class AbstractProcessSpawnerActor extends RecordActorPrototype {
|
||||
public AbstractProcessSpawnerActor(Gson gson,
|
||||
ServiceConfiguration configuration,
|
||||
MqPersistence persistence,
|
||||
ProcessService processService,
|
||||
ProcessSpawnerService processSpawnerService,
|
||||
String inboxName,
|
||||
ProcessService.ProcessId processId) {
|
||||
ProcessSpawnerService.ProcessId processId) {
|
||||
super(gson);
|
||||
this.node = configuration.node();
|
||||
this.persistence = persistence;
|
||||
this.processService = processService;
|
||||
this.processSpawnerService = processSpawnerService;
|
||||
this.inboxName = inboxName + ":" + node;
|
||||
this.processId = processId;
|
||||
}
|
||||
@@ -149,7 +152,7 @@ public class AbstractProcessSpawnerActor extends RecordActorPrototype {
|
||||
// Run this call in a separate thread so that this thread can be interrupted waiting for it
|
||||
executorService.submit(() -> {
|
||||
try {
|
||||
processService.trigger(processId);
|
||||
processSpawnerService.trigger(processId);
|
||||
} catch (Exception e) {
|
||||
logger.warn("Error in triggering process", e);
|
||||
error.set(true);
|
||||
|
@@ -4,9 +4,9 @@ import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@Singleton
|
||||
@@ -17,13 +17,13 @@ public class ConverterMonitorActor extends AbstractProcessSpawnerActor {
|
||||
public ConverterMonitorActor(Gson gson,
|
||||
ServiceConfiguration configuration,
|
||||
MqPersistence persistence,
|
||||
ProcessService processService) {
|
||||
ProcessSpawnerService processSpawnerService) {
|
||||
super(gson,
|
||||
configuration,
|
||||
persistence,
|
||||
processService,
|
||||
processSpawnerService,
|
||||
ProcessInboxNames.CONVERTER_INBOX,
|
||||
ProcessService.ProcessId.CONVERTER);
|
||||
ProcessSpawnerService.ProcessId.CONVERTER);
|
||||
}
|
||||
|
||||
|
||||
|
@@ -4,9 +4,9 @@ import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@Singleton
|
||||
@@ -16,13 +16,13 @@ public class CrawlerMonitorActor extends AbstractProcessSpawnerActor {
|
||||
public CrawlerMonitorActor(Gson gson,
|
||||
ServiceConfiguration configuration,
|
||||
MqPersistence persistence,
|
||||
ProcessService processService) {
|
||||
ProcessSpawnerService processSpawnerService) {
|
||||
super(gson,
|
||||
configuration,
|
||||
persistence,
|
||||
processService,
|
||||
processSpawnerService,
|
||||
ProcessInboxNames.CRAWLER_INBOX,
|
||||
ProcessService.ProcessId.CRAWLER);
|
||||
ProcessSpawnerService.ProcessId.CRAWLER);
|
||||
}
|
||||
|
||||
|
||||
|
@@ -6,7 +6,7 @@ import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@Singleton
|
||||
@@ -16,13 +16,13 @@ public class ExportTaskMonitorActor extends AbstractProcessSpawnerActor {
|
||||
public ExportTaskMonitorActor(Gson gson,
|
||||
ServiceConfiguration configuration,
|
||||
MqPersistence persistence,
|
||||
ProcessService processService) {
|
||||
ProcessSpawnerService processSpawnerService) {
|
||||
super(gson,
|
||||
configuration,
|
||||
persistence,
|
||||
processService,
|
||||
processSpawnerService,
|
||||
ProcessInboxNames.EXPORT_TASK_INBOX,
|
||||
ProcessService.ProcessId.EXPORT_TASKS);
|
||||
ProcessSpawnerService.ProcessId.EXPORT_TASKS);
|
||||
}
|
||||
|
||||
|
||||
|
@@ -4,9 +4,9 @@ import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@Singleton
|
||||
@@ -17,13 +17,13 @@ public class IndexConstructorMonitorActor extends AbstractProcessSpawnerActor {
|
||||
public IndexConstructorMonitorActor(Gson gson,
|
||||
ServiceConfiguration configuration,
|
||||
MqPersistence persistence,
|
||||
ProcessService processService) {
|
||||
ProcessSpawnerService processSpawnerService) {
|
||||
super(gson,
|
||||
configuration,
|
||||
persistence,
|
||||
processService,
|
||||
processSpawnerService,
|
||||
ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX,
|
||||
ProcessService.ProcessId.INDEX_CONSTRUCTOR);
|
||||
ProcessSpawnerService.ProcessId.INDEX_CONSTRUCTOR);
|
||||
}
|
||||
|
||||
|
||||
|
@@ -6,7 +6,7 @@ import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@Singleton
|
||||
@@ -16,13 +16,13 @@ public class LiveCrawlerMonitorActor extends AbstractProcessSpawnerActor {
|
||||
public LiveCrawlerMonitorActor(Gson gson,
|
||||
ServiceConfiguration configuration,
|
||||
MqPersistence persistence,
|
||||
ProcessService processService) {
|
||||
ProcessSpawnerService processSpawnerService) {
|
||||
super(gson,
|
||||
configuration,
|
||||
persistence,
|
||||
processService,
|
||||
processSpawnerService,
|
||||
ProcessInboxNames.LIVE_CRAWLER_INBOX,
|
||||
ProcessService.ProcessId.LIVE_CRAWLER);
|
||||
ProcessSpawnerService.ProcessId.LIVE_CRAWLER);
|
||||
}
|
||||
|
||||
|
||||
|
@@ -4,9 +4,9 @@ import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@Singleton
|
||||
@@ -17,13 +17,13 @@ public class LoaderMonitorActor extends AbstractProcessSpawnerActor {
|
||||
public LoaderMonitorActor(Gson gson,
|
||||
ServiceConfiguration configuration,
|
||||
MqPersistence persistence,
|
||||
ProcessService processService) {
|
||||
ProcessSpawnerService processSpawnerService) {
|
||||
|
||||
super(gson,
|
||||
configuration,
|
||||
persistence, processService,
|
||||
persistence, processSpawnerService,
|
||||
ProcessInboxNames.LOADER_INBOX,
|
||||
ProcessService.ProcessId.LOADER);
|
||||
ProcessSpawnerService.ProcessId.LOADER);
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -0,0 +1,29 @@
|
||||
package nu.marginalia.actor.proc;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@Singleton
|
||||
public class NdpMonitorActor extends AbstractProcessSpawnerActor {
|
||||
|
||||
@Inject
|
||||
public NdpMonitorActor(Gson gson,
|
||||
ServiceConfiguration configuration,
|
||||
MqPersistence persistence,
|
||||
ProcessSpawnerService processSpawnerService) {
|
||||
super(gson,
|
||||
configuration,
|
||||
persistence,
|
||||
processSpawnerService,
|
||||
ProcessInboxNames.NDP_INBOX,
|
||||
ProcessSpawnerService.ProcessId.NDP);
|
||||
}
|
||||
|
||||
|
||||
}
|
@@ -13,7 +13,7 @@ import nu.marginalia.mq.persistence.MqMessageHandlerRegistry;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.mqapi.ping.PingRequest;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@@ -25,17 +25,21 @@ import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
|
||||
// Unlike other monitor actors, the ping monitor will not merely wait for a request
|
||||
// to be sent, but send one itself, hence we can't extend AbstractProcessSpawnerActor
|
||||
// but have to reimplement a lot of the same logic ourselves.
|
||||
@Singleton
|
||||
public class PingMonitorActor extends RecordActorPrototype {
|
||||
|
||||
private final MqPersistence persistence;
|
||||
private final ProcessService processService;
|
||||
private final ProcessSpawnerService processSpawnerService;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
public static final int MAX_ATTEMPTS = 3;
|
||||
private final String inboxName;
|
||||
private final ProcessService.ProcessId processId;
|
||||
private final ProcessSpawnerService.ProcessId processId;
|
||||
private final ExecutorService executorService = Executors.newSingleThreadExecutor();
|
||||
private final int node;
|
||||
private final Gson gson;
|
||||
@@ -53,7 +57,6 @@ public class PingMonitorActor extends RecordActorPrototype {
|
||||
return switch (self) {
|
||||
case Initial i -> {
|
||||
PingRequest request = new PingRequest();
|
||||
|
||||
persistence.sendNewMessage(inboxName, null, null,
|
||||
"PingRequest",
|
||||
gson.toJson(request),
|
||||
@@ -65,7 +68,7 @@ public class PingMonitorActor extends RecordActorPrototype {
|
||||
for (;;) {
|
||||
var messages = persistence.eavesdrop(inboxName, 1);
|
||||
|
||||
if (messages.isEmpty() && !processService.isRunning(processId)) {
|
||||
if (messages.isEmpty() && !processSpawnerService.isRunning(processId)) {
|
||||
synchronized (processId) {
|
||||
processId.wait(5000);
|
||||
}
|
||||
@@ -107,7 +110,7 @@ public class PingMonitorActor extends RecordActorPrototype {
|
||||
catch (InterruptedException ex) {
|
||||
// We get this exception when the process is cancelled by the user
|
||||
|
||||
processService.kill(processId);
|
||||
processSpawnerService.kill(processId);
|
||||
setCurrentMessageToDead();
|
||||
|
||||
yield new Aborted();
|
||||
@@ -127,14 +130,14 @@ public class PingMonitorActor extends RecordActorPrototype {
|
||||
public PingMonitorActor(Gson gson,
|
||||
ServiceConfiguration configuration,
|
||||
MqPersistence persistence,
|
||||
ProcessService processService) throws SQLException {
|
||||
ProcessSpawnerService processSpawnerService) throws SQLException {
|
||||
super(gson);
|
||||
this.gson = gson;
|
||||
this.node = configuration.node();
|
||||
this.persistence = persistence;
|
||||
this.processService = processService;
|
||||
this.processSpawnerService = processSpawnerService;
|
||||
this.inboxName = ProcessInboxNames.PING_INBOX + ":" + node;
|
||||
this.processId = ProcessService.ProcessId.PING;
|
||||
this.processId = ProcessSpawnerService.ProcessId.PING;
|
||||
}
|
||||
|
||||
/** Sets the message to dead in the database to avoid
|
||||
@@ -163,7 +166,7 @@ public class PingMonitorActor extends RecordActorPrototype {
|
||||
// Run this call in a separate thread so that this thread can be interrupted waiting for it
|
||||
executorService.submit(() -> {
|
||||
try {
|
||||
processService.trigger(processId);
|
||||
processSpawnerService.trigger(processId);
|
||||
} catch (Exception e) {
|
||||
logger.warn("Error in triggering process", e);
|
||||
error.set(true);
|
||||
|
@@ -8,7 +8,7 @@ import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.actor.state.Resume;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.control.ServiceEventLog;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@@ -21,7 +21,7 @@ import java.util.concurrent.TimeUnit;
|
||||
public class ProcessLivenessMonitorActor extends RecordActorPrototype {
|
||||
|
||||
private final ServiceEventLog eventLogService;
|
||||
private final ProcessService processService;
|
||||
private final ProcessSpawnerService processSpawnerService;
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
private final int node;
|
||||
@@ -49,7 +49,7 @@ public class ProcessLivenessMonitorActor extends RecordActorPrototype {
|
||||
var processId = heartbeat.getProcessId();
|
||||
if (null == processId) continue;
|
||||
|
||||
if (processService.isRunning(processId) && heartbeat.lastSeenMillis() < 10_000)
|
||||
if (processSpawnerService.isRunning(processId) && heartbeat.lastSeenMillis() < 10_000)
|
||||
continue;
|
||||
|
||||
flagProcessAsStopped(heartbeat);
|
||||
@@ -72,12 +72,12 @@ public class ProcessLivenessMonitorActor extends RecordActorPrototype {
|
||||
public ProcessLivenessMonitorActor(Gson gson,
|
||||
ServiceEventLog eventLogService,
|
||||
ServiceConfiguration configuration,
|
||||
ProcessService processService,
|
||||
ProcessSpawnerService processSpawnerService,
|
||||
HikariDataSource dataSource) {
|
||||
super(gson);
|
||||
this.node = configuration.node();
|
||||
this.eventLogService = eventLogService;
|
||||
this.processService = processService;
|
||||
this.processSpawnerService = processSpawnerService;
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
@@ -208,8 +208,8 @@ public class ProcessLivenessMonitorActor extends RecordActorPrototype {
|
||||
public boolean isRunning() {
|
||||
return "RUNNING".equals(status);
|
||||
}
|
||||
public ProcessService.ProcessId getProcessId() {
|
||||
return ProcessService.translateExternalIdBase(processBase);
|
||||
public ProcessSpawnerService.ProcessId getProcessId() {
|
||||
return ProcessSpawnerService.translateExternalIdBase(processBase);
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -47,6 +47,8 @@ public class ScrapeFeedsActor extends RecordActorPrototype {
|
||||
|
||||
private final Path feedPath = WmsaHome.getHomePath().resolve("data/scrape-urls.txt");
|
||||
|
||||
private static boolean insertFoundDomains = Boolean.getBoolean("loader.insertFoundDomains");
|
||||
|
||||
public record Initial() implements ActorStep {}
|
||||
@Resume(behavior = ActorResumeBehavior.RETRY)
|
||||
public record Wait(String ts) implements ActorStep {}
|
||||
@@ -57,6 +59,8 @@ public class ScrapeFeedsActor extends RecordActorPrototype {
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch(self) {
|
||||
case Initial() -> {
|
||||
if (!insertFoundDomains) yield new Error("Domain insertion prohibited, aborting");
|
||||
|
||||
if (nodeConfigurationService.get(nodeId).profile() != NodeProfile.REALTIME) {
|
||||
yield new Error("Invalid node profile for RSS update");
|
||||
}
|
||||
|
@@ -3,11 +3,11 @@ package nu.marginalia.actor.task;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.state.ActorControlFlowException;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.mq.MqMessage;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@@ -20,13 +20,13 @@ public class ActorProcessWatcher {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(ActorProcessWatcher.class);
|
||||
private final MqPersistence persistence;
|
||||
private final ProcessService processService;
|
||||
private final ProcessSpawnerService processSpawnerService;
|
||||
|
||||
@Inject
|
||||
public ActorProcessWatcher(MqPersistence persistence,
|
||||
ProcessService processService) {
|
||||
ProcessSpawnerService processSpawnerService) {
|
||||
this.persistence = persistence;
|
||||
this.processService = processService;
|
||||
this.processSpawnerService = processSpawnerService;
|
||||
}
|
||||
|
||||
/** Wait for a process to start, and then wait for a response from the process,
|
||||
@@ -36,7 +36,7 @@ public class ActorProcessWatcher {
|
||||
* <p>
|
||||
* When interrupted, the process is killed and the message is marked as dead.
|
||||
*/
|
||||
public MqMessage waitResponse(MqOutbox outbox, ProcessService.ProcessId processId, long msgId)
|
||||
public MqMessage waitResponse(MqOutbox outbox, ProcessSpawnerService.ProcessId processId, long msgId)
|
||||
throws ActorControlFlowException, InterruptedException, SQLException
|
||||
{
|
||||
// enums values only have a single instance,
|
||||
@@ -65,7 +65,7 @@ public class ActorProcessWatcher {
|
||||
// This will prevent the monitor process from attempting to respawn the process as we kill it
|
||||
|
||||
outbox.flagAsDead(msgId);
|
||||
processService.kill(processId);
|
||||
processSpawnerService.kill(processId);
|
||||
|
||||
logger.info("Process {} killed due to interrupt", processId);
|
||||
}
|
||||
@@ -94,12 +94,12 @@ public class ActorProcessWatcher {
|
||||
}
|
||||
|
||||
/** Wait the specified time for the specified process to start running (does not start the process) */
|
||||
private boolean waitForProcess(ProcessService.ProcessId processId, TimeUnit unit, int duration) throws InterruptedException {
|
||||
private boolean waitForProcess(ProcessSpawnerService.ProcessId processId, TimeUnit unit, int duration) throws InterruptedException {
|
||||
|
||||
// Wait for process to start
|
||||
long deadline = System.currentTimeMillis() + unit.toMillis(duration);
|
||||
while (System.currentTimeMillis() < deadline) {
|
||||
if (processService.isRunning(processId))
|
||||
if (processSpawnerService.isRunning(processId))
|
||||
return true;
|
||||
|
||||
TimeUnit.MILLISECONDS.sleep(100);
|
||||
|
@@ -12,7 +12,7 @@ import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mqapi.converting.ConvertRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.sideload.RedditSideloadHelper;
|
||||
import nu.marginalia.sideload.SideloadHelper;
|
||||
import nu.marginalia.sideload.StackExchangeSideloadHelper;
|
||||
@@ -218,7 +218,7 @@ public class ConvertActor extends RecordActorPrototype {
|
||||
);
|
||||
}
|
||||
case ConvertWait(FileStorageId destFid, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(mqConverterOutbox, ProcessService.ProcessId.CONVERTER, msgId);
|
||||
var rsp = processWatcher.waitResponse(mqConverterOutbox, ProcessSpawnerService.ProcessId.CONVERTER, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
yield new Error("Converter failed");
|
||||
|
@@ -18,7 +18,7 @@ import nu.marginalia.mqapi.index.IndexName;
|
||||
import nu.marginalia.mqapi.loading.LoadRequest;
|
||||
import nu.marginalia.nodecfg.NodeConfigurationService;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
@@ -95,7 +95,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
||||
case Convert(FileStorageId crawlId, FileStorageId processedId, long msgId) when msgId < 0 ->
|
||||
new Convert(crawlId, processedId, mqConverterOutbox.sendAsync(ConvertRequest.forCrawlData(crawlId, processedId)));
|
||||
case Convert(FileStorageId crawlId, FileStorageId processedId, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(mqConverterOutbox, ProcessService.ProcessId.CONVERTER, msgId);
|
||||
var rsp = processWatcher.waitResponse(mqConverterOutbox, ProcessSpawnerService.ProcessId.CONVERTER, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK)
|
||||
yield new Error("Converter failed");
|
||||
@@ -129,7 +129,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
||||
yield new Load(processedIds, id);
|
||||
}
|
||||
case Load(List<FileStorageId> processedIds, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(mqLoaderOutbox, ProcessService.ProcessId.LOADER, msgId);
|
||||
var rsp = processWatcher.waitResponse(mqLoaderOutbox, ProcessSpawnerService.ProcessId.LOADER, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
yield new Error("Loader failed");
|
||||
@@ -165,7 +165,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
||||
}
|
||||
case ReindexFwd(long id) when id < 0 -> new ReindexFwd(createIndex(IndexName.FORWARD));
|
||||
case ReindexFwd(long id) -> {
|
||||
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessService.ProcessId.INDEX_CONSTRUCTOR, id);
|
||||
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessSpawnerService.ProcessId.INDEX_CONSTRUCTOR, id);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK)
|
||||
yield new Error("Forward index construction failed");
|
||||
@@ -174,7 +174,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
||||
}
|
||||
case ReindexFull(long id) when id < 0 -> new ReindexFull(createIndex(IndexName.REVERSE_FULL));
|
||||
case ReindexFull(long id) -> {
|
||||
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessService.ProcessId.INDEX_CONSTRUCTOR, id);
|
||||
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessSpawnerService.ProcessId.INDEX_CONSTRUCTOR, id);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK)
|
||||
yield new Error("Full index construction failed");
|
||||
@@ -183,7 +183,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
||||
}
|
||||
case ReindexPrio(long id) when id < 0 -> new ReindexPrio(createIndex(IndexName.REVERSE_PRIO));
|
||||
case ReindexPrio(long id) -> {
|
||||
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessService.ProcessId.INDEX_CONSTRUCTOR, id);
|
||||
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessSpawnerService.ProcessId.INDEX_CONSTRUCTOR, id);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK)
|
||||
yield new Error("Prio index construction failed");
|
||||
|
@@ -13,7 +13,7 @@ import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mqapi.crawling.CrawlRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
@@ -76,7 +76,7 @@ public class CrawlActor extends RecordActorPrototype {
|
||||
case Crawl (long msgId, FileStorageId fid, boolean cascadeLoad) -> {
|
||||
var rsp = processWatcher.waitResponse(
|
||||
mqCrawlerOutbox,
|
||||
ProcessService.ProcessId.CRAWLER,
|
||||
ProcessSpawnerService.ProcessId.CRAWLER,
|
||||
msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
|
@@ -10,7 +10,7 @@ import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageState;
|
||||
@@ -55,7 +55,7 @@ public class ExportAtagsActor extends RecordActorPrototype {
|
||||
yield new Run(responseMsgId, crawlId, destId, newMsgId);
|
||||
}
|
||||
case Run(long responseMsgId, FileStorageId crawlId, FileStorageId destId, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessSpawnerService.ProcessId.EXPORT_TASKS, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
storageService.flagFileForDeletion(destId);
|
||||
|
@@ -10,7 +10,7 @@ import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageState;
|
||||
@@ -54,7 +54,7 @@ public class ExportFeedsActor extends RecordActorPrototype {
|
||||
yield new Run(responseMsgId, crawlId, destId, newMsgId);
|
||||
}
|
||||
case Run(long responseMsgId, _, FileStorageId destId, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessSpawnerService.ProcessId.EXPORT_TASKS, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
storageService.flagFileForDeletion(destId);
|
||||
|
@@ -9,7 +9,7 @@ import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageState;
|
||||
@@ -52,7 +52,7 @@ public class ExportSampleDataActor extends RecordActorPrototype {
|
||||
yield new Run(crawlId, destId, size, ctFilter, name, newMsgId);
|
||||
}
|
||||
case Run(_, FileStorageId destId, _, _, _, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessSpawnerService.ProcessId.EXPORT_TASKS, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
storageService.flagFileForDeletion(destId);
|
||||
|
@@ -10,7 +10,7 @@ import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageState;
|
||||
@@ -52,7 +52,7 @@ public class ExportTermFreqActor extends RecordActorPrototype {
|
||||
yield new Run(responseMsgId, crawlId, destId, newMsgId);
|
||||
}
|
||||
case Run(long responseMsgId, _, FileStorageId destId, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessSpawnerService.ProcessId.EXPORT_TASKS, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
storageService.flagFileForDeletion(destId);
|
||||
|
@@ -13,7 +13,7 @@ import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mqapi.crawling.LiveCrawlRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.slf4j.Logger;
|
||||
@@ -44,7 +44,6 @@ public class LiveCrawlActor extends RecordActorPrototype {
|
||||
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
logger.info("{}", self);
|
||||
return switch (self) {
|
||||
case Initial() -> {
|
||||
yield new Monitor("-");
|
||||
@@ -75,7 +74,7 @@ public class LiveCrawlActor extends RecordActorPrototype {
|
||||
yield new LiveCrawl(feedsHash, id);
|
||||
}
|
||||
case LiveCrawl(String feedsHash, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(mqLiveCrawlerOutbox, ProcessService.ProcessId.LIVE_CRAWLER, msgId);
|
||||
var rsp = processWatcher.waitResponse(mqLiveCrawlerOutbox, ProcessSpawnerService.ProcessId.LIVE_CRAWLER, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
yield new Error("Crawler failed");
|
||||
|
@@ -11,7 +11,7 @@ import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mqapi.crawling.CrawlRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
@@ -51,7 +51,7 @@ public class RecrawlSingleDomainActor extends RecordActorPrototype {
|
||||
case Crawl (long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(
|
||||
mqCrawlerOutbox,
|
||||
ProcessService.ProcessId.CRAWLER,
|
||||
ProcessSpawnerService.ProcessId.CRAWLER,
|
||||
msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
|
@@ -9,7 +9,7 @@ import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.process.ProcessSpawnerService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@@ -34,7 +34,7 @@ public class TriggerAdjacencyCalculationActor extends RecordActorPrototype {
|
||||
yield new Run(newMsgId);
|
||||
}
|
||||
case Run(long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessSpawnerService.ProcessId.EXPORT_TASKS, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
yield new Error("Exporter failed");
|
||||
|
@@ -5,6 +5,8 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.nsfw.NsfwDomainFilter;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@@ -12,23 +14,26 @@ import nu.marginalia.service.module.ServiceConfiguration;
|
||||
public class UpdateNsfwFiltersActor extends RecordActorPrototype {
|
||||
private final ServiceConfiguration serviceConfiguration;
|
||||
private final NsfwDomainFilter nsfwDomainFilter;
|
||||
private final MqPersistence persistence;
|
||||
|
||||
public record Initial() implements ActorStep {}
|
||||
public record Run() implements ActorStep {}
|
||||
public record Initial(long respondMsgId) implements ActorStep {}
|
||||
public record Run(long respondMsgId) implements ActorStep {}
|
||||
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch(self) {
|
||||
case Initial() -> {
|
||||
case Initial(long respondMsgId) -> {
|
||||
if (serviceConfiguration.node() != 1) {
|
||||
persistence.updateMessageState(respondMsgId, MqMessageState.ERR);
|
||||
yield new Error("This actor can only run on node 1");
|
||||
}
|
||||
else {
|
||||
yield new Run();
|
||||
yield new Run(respondMsgId);
|
||||
}
|
||||
}
|
||||
case Run() -> {
|
||||
case Run(long respondMsgId) -> {
|
||||
nsfwDomainFilter.fetchLists();
|
||||
persistence.updateMessageState(respondMsgId, MqMessageState.OK);
|
||||
yield new End();
|
||||
}
|
||||
default -> new Error();
|
||||
@@ -43,11 +48,13 @@ public class UpdateNsfwFiltersActor extends RecordActorPrototype {
|
||||
@Inject
|
||||
public UpdateNsfwFiltersActor(Gson gson,
|
||||
ServiceConfiguration serviceConfiguration,
|
||||
NsfwDomainFilter nsfwDomainFilter)
|
||||
NsfwDomainFilter nsfwDomainFilter,
|
||||
MqPersistence persistence)
|
||||
{
|
||||
super(gson);
|
||||
this.serviceConfiguration = serviceConfiguration;
|
||||
this.nsfwDomainFilter = nsfwDomainFilter;
|
||||
this.persistence = persistence;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -10,6 +10,7 @@ import nu.marginalia.actor.state.ActorStateInstance;
|
||||
import nu.marginalia.actor.task.DownloadSampleActor;
|
||||
import nu.marginalia.actor.task.RestoreBackupActor;
|
||||
import nu.marginalia.actor.task.TriggerAdjacencyCalculationActor;
|
||||
import nu.marginalia.actor.task.UpdateNsfwFiltersActor;
|
||||
import nu.marginalia.functions.execution.api.*;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.service.server.DiscoverableService;
|
||||
@@ -263,4 +264,19 @@ public class ExecutorGrpcService
|
||||
System.exit(0);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateNsfwFilters(RpcUpdateNsfwFilters request, StreamObserver<Empty> responseObserver) {
|
||||
logger.info("Got request {}", request);
|
||||
try {
|
||||
actorControlService.startFrom(ExecutorActor.UPDATE_NSFW_LISTS,
|
||||
new UpdateNsfwFiltersActor.Initial(request.getMsgId()));
|
||||
|
||||
responseObserver.onNext(Empty.getDefaultInstance());
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Failed to update nsfw filters", e);
|
||||
responseObserver.onError(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -8,6 +8,7 @@ import nu.marginalia.crawl.CrawlerMain;
|
||||
import nu.marginalia.index.IndexConstructorMain;
|
||||
import nu.marginalia.livecrawler.LiveCrawlerMain;
|
||||
import nu.marginalia.loading.LoaderMain;
|
||||
import nu.marginalia.ndp.NdpMain;
|
||||
import nu.marginalia.ping.PingMain;
|
||||
import nu.marginalia.service.control.ServiceEventLog;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
@@ -28,7 +29,7 @@ import java.util.List;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
@Singleton
|
||||
public class ProcessService {
|
||||
public class ProcessSpawnerService {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final Marker processMarker = MarkerFactory.getMarker("PROCESS");
|
||||
|
||||
@@ -57,6 +58,7 @@ public class ProcessService {
|
||||
CONVERTER(ConverterMain.class),
|
||||
LOADER(LoaderMain.class),
|
||||
INDEX_CONSTRUCTOR(IndexConstructorMain.class),
|
||||
NDP(NdpMain.class),
|
||||
EXPORT_TASKS(ExportTasksMain.class),
|
||||
;
|
||||
|
||||
@@ -72,6 +74,7 @@ public class ProcessService {
|
||||
case CONVERTER -> "CONVERTER_PROCESS_OPTS";
|
||||
case LOADER -> "LOADER_PROCESS_OPTS";
|
||||
case PING -> "PING_PROCESS_OPTS";
|
||||
case NDP -> "NDP_PROCESS_OPTS";
|
||||
case INDEX_CONSTRUCTOR -> "INDEX_CONSTRUCTION_PROCESS_OPTS";
|
||||
case EXPORT_TASKS -> "EXPORT_TASKS_PROCESS_OPTS";
|
||||
};
|
||||
@@ -85,7 +88,7 @@ public class ProcessService {
|
||||
}
|
||||
|
||||
@Inject
|
||||
public ProcessService(BaseServiceParams params) {
|
||||
public ProcessSpawnerService(BaseServiceParams params) {
|
||||
this.eventLog = params.eventLog;
|
||||
this.node = params.configuration.node();
|
||||
}
|
@@ -2,6 +2,8 @@ package nu.marginalia.api.domains;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.api.domains.model.DomainInformation;
|
||||
import nu.marginalia.api.domains.model.SimilarDomain;
|
||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||
import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
@@ -10,16 +12,19 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.concurrent.*;
|
||||
|
||||
import nu.marginalia.api.domains.model.*;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.Future;
|
||||
|
||||
@Singleton
|
||||
public class DomainInfoClient {
|
||||
private static final Logger logger = LoggerFactory.getLogger(DomainInfoClient.class);
|
||||
|
||||
private final GrpcSingleNodeChannelPool<DomainInfoAPIGrpc.DomainInfoAPIBlockingStub> channelPool;
|
||||
private final ExecutorService executor = Executors.newWorkStealingPool(8);
|
||||
|
||||
|
||||
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
|
||||
private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newWorkStealingPool(8);
|
||||
|
||||
@Inject
|
||||
public DomainInfoClient(GrpcChannelPoolFactory factory) {
|
||||
|
@@ -11,6 +11,7 @@ import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
import javax.annotation.CheckReturnValue;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@@ -23,7 +24,9 @@ import java.util.function.BiConsumer;
|
||||
|
||||
@Singleton
|
||||
public class FeedsClient {
|
||||
private final ExecutorService executorService = Executors.newCachedThreadPool();
|
||||
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
|
||||
private static final ExecutorService executorService = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newCachedThreadPool();
|
||||
|
||||
private final GrpcSingleNodeChannelPool<FeedApiGrpc.FeedApiBlockingStub> channelPool;
|
||||
private final MqOutbox updateFeedsOutbox;
|
||||
|
||||
@@ -59,6 +62,11 @@ public class FeedsClient {
|
||||
.forEachRemaining(rsp -> consumer.accept(rsp.getDomain(), new ArrayList<>(rsp.getUrlList())));
|
||||
}
|
||||
|
||||
public boolean waitReady(Duration duration) throws InterruptedException {
|
||||
return channelPool.awaitChannel(duration);
|
||||
}
|
||||
|
||||
|
||||
/** Get the hash of the feed data, for identifying when the data has been updated */
|
||||
public String getFeedDataHash() {
|
||||
return channelPool.call(FeedApiGrpc.FeedApiBlockingStub::getFeedDataHash)
|
||||
|
@@ -35,6 +35,7 @@ dependencies {
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.commons.lang3
|
||||
implementation libs.commons.io
|
||||
implementation libs.httpclient
|
||||
implementation libs.wiremock
|
||||
|
||||
implementation libs.prometheus
|
||||
|
@@ -20,19 +20,36 @@ import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorage;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.config.ConnectionConfig;
|
||||
import org.apache.hc.client5.http.config.RequestConfig;
|
||||
import org.apache.hc.client5.http.cookie.StandardCookieSpec;
|
||||
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder;
|
||||
import org.apache.hc.core5.http.Header;
|
||||
import org.apache.hc.core5.http.HeaderElement;
|
||||
import org.apache.hc.core5.http.HeaderElements;
|
||||
import org.apache.hc.core5.http.HttpResponse;
|
||||
import org.apache.hc.core5.http.io.SocketConfig;
|
||||
import org.apache.hc.core5.http.io.entity.EntityUtils;
|
||||
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
||||
import org.apache.hc.core5.http.message.MessageSupport;
|
||||
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||
import org.apache.hc.core5.util.TimeValue;
|
||||
import org.apache.hc.core5.util.Timeout;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.sql.SQLException;
|
||||
import java.time.*;
|
||||
import java.time.Instant;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.ZoneId;
|
||||
import java.time.ZonedDateTime;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
@@ -55,6 +72,8 @@ public class FeedFetcherService {
|
||||
|
||||
private final DomainCoordinator domainCoordinator;
|
||||
|
||||
private final HttpClient httpClient;
|
||||
|
||||
private volatile boolean updating;
|
||||
|
||||
@Inject
|
||||
@@ -71,6 +90,83 @@ public class FeedFetcherService {
|
||||
this.serviceHeartbeat = serviceHeartbeat;
|
||||
this.executorClient = executorClient;
|
||||
this.domainCoordinator = domainCoordinator;
|
||||
|
||||
final ConnectionConfig connectionConfig = ConnectionConfig.custom()
|
||||
.setSocketTimeout(15, TimeUnit.SECONDS)
|
||||
.setConnectTimeout(15, TimeUnit.SECONDS)
|
||||
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
|
||||
.build();
|
||||
|
||||
|
||||
var connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
|
||||
.setMaxConnPerRoute(2)
|
||||
.setMaxConnTotal(50)
|
||||
.setDefaultConnectionConfig(connectionConfig)
|
||||
.build();
|
||||
|
||||
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
|
||||
.setSoLinger(TimeValue.ofSeconds(-1))
|
||||
.setSoTimeout(Timeout.ofSeconds(10))
|
||||
.build()
|
||||
);
|
||||
|
||||
Thread.ofPlatform().daemon(true).start(() -> {
|
||||
try {
|
||||
for (;;) {
|
||||
TimeUnit.SECONDS.sleep(15);
|
||||
logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
|
||||
}
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
});
|
||||
|
||||
final RequestConfig defaultRequestConfig = RequestConfig.custom()
|
||||
.setCookieSpec(StandardCookieSpec.IGNORE)
|
||||
.setResponseTimeout(10, TimeUnit.SECONDS)
|
||||
.setConnectionRequestTimeout(5, TimeUnit.MINUTES)
|
||||
.build();
|
||||
|
||||
httpClient = HttpClients.custom()
|
||||
.setDefaultRequestConfig(defaultRequestConfig)
|
||||
.setConnectionManager(connectionManager)
|
||||
.setUserAgent(WmsaHome.getUserAgent().uaIdentifier())
|
||||
.setConnectionManager(connectionManager)
|
||||
.setKeepAliveStrategy(new ConnectionKeepAliveStrategy() {
|
||||
// Default keep-alive duration is 3 minutes, but this is too long for us,
|
||||
// as we are either going to re-use it fairly quickly or close it for a long time.
|
||||
//
|
||||
// So we set it to 30 seconds or clamp the server-provided value to a minimum of 10 seconds.
|
||||
private static final TimeValue defaultValue = TimeValue.ofSeconds(30);
|
||||
|
||||
@Override
|
||||
public TimeValue getKeepAliveDuration(HttpResponse response, HttpContext context) {
|
||||
final Iterator<HeaderElement> it = MessageSupport.iterate(response, HeaderElements.KEEP_ALIVE);
|
||||
|
||||
while (it.hasNext()) {
|
||||
final HeaderElement he = it.next();
|
||||
final String param = he.getName();
|
||||
final String value = he.getValue();
|
||||
|
||||
if (value == null)
|
||||
continue;
|
||||
if (!"timeout".equalsIgnoreCase(param))
|
||||
continue;
|
||||
|
||||
try {
|
||||
long timeout = Long.parseLong(value);
|
||||
timeout = Math.clamp(timeout, 30, defaultValue.toSeconds());
|
||||
return TimeValue.ofSeconds(timeout);
|
||||
} catch (final NumberFormatException ignore) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return defaultValue;
|
||||
}
|
||||
})
|
||||
.build();
|
||||
|
||||
}
|
||||
|
||||
public enum UpdateMode {
|
||||
@@ -86,13 +182,7 @@ public class FeedFetcherService {
|
||||
|
||||
|
||||
try (FeedDbWriter writer = feedDb.createWriter();
|
||||
HttpClient client = HttpClient.newBuilder()
|
||||
.connectTimeout(Duration.ofSeconds(15))
|
||||
.executor(Executors.newCachedThreadPool())
|
||||
.followRedirects(HttpClient.Redirect.NORMAL)
|
||||
.version(HttpClient.Version.HTTP_2)
|
||||
.build();
|
||||
ExecutorService fetchExecutor = Executors.newCachedThreadPool();
|
||||
ExecutorService fetchExecutor = Executors.newVirtualThreadPerTaskExecutor();
|
||||
FeedJournal feedJournal = FeedJournal.create();
|
||||
var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Update Rss Feeds")
|
||||
) {
|
||||
@@ -137,7 +227,8 @@ public class FeedFetcherService {
|
||||
|
||||
FetchResult feedData;
|
||||
try (DomainLock domainLock = domainCoordinator.lockDomain(new EdgeDomain(feed.domain()))) {
|
||||
feedData = fetchFeedData(feed, client, fetchExecutor, ifModifiedSinceDate, ifNoneMatchTag);
|
||||
feedData = fetchFeedData(feed, fetchExecutor, ifModifiedSinceDate, ifNoneMatchTag);
|
||||
TimeUnit.SECONDS.sleep(1); // Sleep before we yield the lock to avoid hammering the server from multiple processes
|
||||
} catch (Exception ex) {
|
||||
feedData = new FetchResult.TransientError();
|
||||
}
|
||||
@@ -216,7 +307,6 @@ public class FeedFetcherService {
|
||||
}
|
||||
|
||||
private FetchResult fetchFeedData(FeedDefinition feed,
|
||||
HttpClient client,
|
||||
ExecutorService executorService,
|
||||
@Nullable String ifModifiedSinceDate,
|
||||
@Nullable String ifNoneMatchTag)
|
||||
@@ -224,59 +314,63 @@ public class FeedFetcherService {
|
||||
try {
|
||||
URI uri = new URI(feed.feedUrl());
|
||||
|
||||
HttpRequest.Builder requestBuilder = HttpRequest.newBuilder()
|
||||
.GET()
|
||||
.uri(uri)
|
||||
.header("User-Agent", WmsaHome.getUserAgent().uaIdentifier())
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.header("Accept", "text/*, */*;q=0.9")
|
||||
.timeout(Duration.ofSeconds(15))
|
||||
;
|
||||
var requestBuilder = ClassicRequestBuilder.get(uri)
|
||||
.setHeader("User-Agent", WmsaHome.getUserAgent().uaIdentifier())
|
||||
.setHeader("Accept-Encoding", "gzip")
|
||||
.setHeader("Accept", "text/*, */*;q=0.9");
|
||||
|
||||
// Set the If-Modified-Since or If-None-Match headers if we have them
|
||||
// though since there are certain idiosyncrasies in server implementations,
|
||||
// we avoid setting both at the same time as that may turn a 304 into a 200.
|
||||
if (ifNoneMatchTag != null) {
|
||||
requestBuilder.header("If-None-Match", ifNoneMatchTag);
|
||||
requestBuilder.addHeader("If-None-Match", ifNoneMatchTag);
|
||||
} else if (ifModifiedSinceDate != null) {
|
||||
requestBuilder.header("If-Modified-Since", ifModifiedSinceDate);
|
||||
requestBuilder.addHeader("If-Modified-Since", ifModifiedSinceDate);
|
||||
}
|
||||
|
||||
return httpClient.execute(requestBuilder.build(), rsp -> {
|
||||
try {
|
||||
logger.info("Code: {}, URL: {}", rsp.getCode(), uri);
|
||||
|
||||
HttpRequest getRequest = requestBuilder.build();
|
||||
switch (rsp.getCode()) {
|
||||
case 200 -> {
|
||||
if (rsp.getEntity() == null) {
|
||||
return new FetchResult.TransientError(); // No content to read, treat as transient error
|
||||
}
|
||||
byte[] responseData = EntityUtils.toByteArray(rsp.getEntity());
|
||||
|
||||
for (int i = 0; i < 3; i++) {
|
||||
// Decode the response body based on the Content-Type header
|
||||
Header contentTypeHeader = rsp.getFirstHeader("Content-Type");
|
||||
if (contentTypeHeader == null) {
|
||||
return new FetchResult.TransientError();
|
||||
}
|
||||
String contentType = contentTypeHeader.getValue();
|
||||
String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), responseData);
|
||||
|
||||
/* Note we need to use an executor to time-limit the send() method in HttpClient, as
|
||||
* its support for timeouts only applies to the time until response starts to be received,
|
||||
* and does not catch the case when the server starts to send data but then hangs.
|
||||
*/
|
||||
HttpResponse<byte[]> rs = executorService.submit(
|
||||
() -> client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray()))
|
||||
.get(15, TimeUnit.SECONDS);
|
||||
// Grab the ETag header if it exists
|
||||
Header etagHeader = rsp.getFirstHeader("ETag");
|
||||
String newEtagValue = etagHeader == null ? null : etagHeader.getValue();
|
||||
|
||||
if (rs.statusCode() == 429) { // Too Many Requests
|
||||
int retryAfter = Integer.parseInt(rs.headers().firstValue("Retry-After").orElse("2"));
|
||||
Thread.sleep(Duration.ofSeconds(Math.clamp(retryAfter, 1, 5)));
|
||||
continue;
|
||||
}
|
||||
|
||||
String newEtagValue = rs.headers().firstValue("ETag").orElse("");
|
||||
|
||||
return switch (rs.statusCode()) {
|
||||
case 200 -> {
|
||||
byte[] responseData = getResponseData(rs);
|
||||
|
||||
String contentType = rs.headers().firstValue("Content-Type").orElse("");
|
||||
String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), responseData);
|
||||
|
||||
yield new FetchResult.Success(bodyText, newEtagValue);
|
||||
return new FetchResult.Success(bodyText, newEtagValue);
|
||||
}
|
||||
case 304 -> {
|
||||
return new FetchResult.NotModified(); // via If-Modified-Since semantics
|
||||
}
|
||||
case 404 -> {
|
||||
return new FetchResult.PermanentError(); // never try again
|
||||
}
|
||||
default -> {
|
||||
return new FetchResult.TransientError(); // we try again later
|
||||
}
|
||||
}
|
||||
case 304 -> new FetchResult.NotModified(); // via If-Modified-Since semantics
|
||||
case 404 -> new FetchResult.PermanentError(); // never try again
|
||||
default -> new FetchResult.TransientError(); // we try again later
|
||||
};
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
return new FetchResult.PermanentError(); // treat as permanent error
|
||||
}
|
||||
finally {
|
||||
EntityUtils.consumeQuietly(rsp.getEntity());
|
||||
}
|
||||
});
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.debug("Error fetching feed", ex);
|
||||
@@ -285,19 +379,6 @@ public class FeedFetcherService {
|
||||
return new FetchResult.TransientError();
|
||||
}
|
||||
|
||||
private byte[] getResponseData(HttpResponse<byte[]> response) throws IOException {
|
||||
String encoding = response.headers().firstValue("Content-Encoding").orElse("");
|
||||
|
||||
if ("gzip".equals(encoding)) {
|
||||
try (var stream = new GZIPInputStream(new ByteArrayInputStream(response.body()))) {
|
||||
return stream.readAllBytes();
|
||||
}
|
||||
}
|
||||
else {
|
||||
return response.body();
|
||||
}
|
||||
}
|
||||
|
||||
public sealed interface FetchResult {
|
||||
record Success(String value, String etag) implements FetchResult {}
|
||||
record NotModified() implements FetchResult {}
|
||||
|
@@ -5,6 +5,8 @@ import com.google.inject.Guice;
|
||||
import com.google.inject.name.Names;
|
||||
import com.zaxxer.hikari.HikariConfig;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.coordination.LocalDomainCoordinator;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.rss.db.FeedDb;
|
||||
import nu.marginalia.rss.model.FeedItems;
|
||||
@@ -82,6 +84,7 @@ class FeedFetcherServiceTest extends AbstractModule {
|
||||
}
|
||||
|
||||
public void configure() {
|
||||
bind(DomainCoordinator.class).to(LocalDomainCoordinator.class);
|
||||
bind(HikariDataSource.class).toInstance(dataSource);
|
||||
bind(ServiceRegistryIf.class).toInstance(Mockito.mock(ServiceRegistryIf.class));
|
||||
bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(ServiceId.Executor, 1, "", "", 0, UUID.randomUUID()));
|
||||
|
@@ -26,7 +26,9 @@ public class MathClient {
|
||||
private static final Logger logger = LoggerFactory.getLogger(MathClient.class);
|
||||
|
||||
private final GrpcSingleNodeChannelPool<MathApiGrpc.MathApiBlockingStub> channelPool;
|
||||
private final ExecutorService executor = Executors.newWorkStealingPool(8);
|
||||
|
||||
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
|
||||
private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newWorkStealingPool(8);
|
||||
|
||||
@Inject
|
||||
public MathClient(GrpcChannelPoolFactory factory) {
|
||||
|
@@ -38,7 +38,9 @@ public class IndexClient {
|
||||
.help("Count of results filtered by NSFW tier")
|
||||
.register();
|
||||
|
||||
private static final ExecutorService executor = Executors.newCachedThreadPool();
|
||||
|
||||
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
|
||||
private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newCachedThreadPool();
|
||||
|
||||
@Inject
|
||||
public IndexClient(GrpcChannelPoolFactory channelPoolFactory,
|
||||
|
@@ -1,5 +1,6 @@
|
||||
package nu.marginalia.coordination;
|
||||
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
|
||||
import java.time.Duration;
|
||||
@@ -9,6 +10,7 @@ import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.Semaphore;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
@Singleton
|
||||
public class LocalDomainCoordinator implements DomainCoordinator {
|
||||
// The locks are stored in a map, with the domain name as the key. This map will grow
|
||||
// relatively big, but should be manageable since the number of domains is limited to
|
||||
|
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.coordination;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
@@ -13,11 +14,14 @@ import java.util.Optional;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
@Singleton
|
||||
public class ZookeeperDomainCoordinator implements DomainCoordinator {
|
||||
// The locks are stored in a map, with the domain name as the key. This map will grow
|
||||
// relatively big, but should be manageable since the number of domains is limited to
|
||||
// a few hundred thousand typically.
|
||||
private final Map<String, InterProcessSemaphoreV2> locks = new ConcurrentHashMap<>();
|
||||
private final Map<String, Integer> waitCounts = new ConcurrentHashMap<>();
|
||||
|
||||
private final ServiceRegistryIf serviceRegistry;
|
||||
private final int nodeId;
|
||||
|
||||
@@ -32,27 +36,35 @@ public class ZookeeperDomainCoordinator implements DomainCoordinator {
|
||||
* and may be held by another thread. The caller is responsible for locking and releasing the lock.
|
||||
*/
|
||||
public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
|
||||
var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::createSemapore);
|
||||
final String key = domain.topDomain.toLowerCase();
|
||||
var sem = locks.computeIfAbsent(key, this::createSemapore);
|
||||
|
||||
// Increment or add a wait count for the domain
|
||||
waitCounts.compute(key, (k,value) -> (value == null ? 1 : value + 1));
|
||||
try {
|
||||
var lease = sem.acquire();
|
||||
|
||||
return new ZkDomainLock(sem, lease);
|
||||
return new ZkDomainLock(sem, sem.acquire());
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException("Failed to acquire lock for domain: " + domain.topDomain, e);
|
||||
}
|
||||
finally {
|
||||
// Decrement or remove the wait count for the domain
|
||||
waitCounts.compute(key, (k,value) -> (value == null || value <= 1) ? null : value - 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public Optional<DomainLock> tryLockDomain(EdgeDomain domain) throws InterruptedException {
|
||||
return tryLockDomain(domain, Duration.ofSeconds(1)); // Underlying semaphore doesn't have a tryLock method, so we use a short timeout
|
||||
}
|
||||
|
||||
|
||||
public Optional<DomainLock> tryLockDomain(EdgeDomain domain, Duration timeout) throws InterruptedException {
|
||||
final String key = domain.topDomain.toLowerCase();
|
||||
var sem = locks.computeIfAbsent(key, this::createSemapore);
|
||||
|
||||
var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::createSemapore);
|
||||
|
||||
// Increment or add a wait count for the domain
|
||||
waitCounts.compute(key, (k,value) -> (value == null ? 1 : value + 1));
|
||||
try {
|
||||
var lease = sem.acquire(timeout.toMillis(), TimeUnit.MILLISECONDS); // Acquire with timeout
|
||||
if (lease != null) {
|
||||
@@ -65,6 +77,9 @@ public class ZookeeperDomainCoordinator implements DomainCoordinator {
|
||||
catch (Exception e) {
|
||||
return Optional.empty(); // If we fail to acquire the lock, we return an empty optional
|
||||
}
|
||||
finally {
|
||||
waitCounts.compute(key, (k,value) -> (value == null || value <= 1) ? null : value - 1);
|
||||
}
|
||||
}
|
||||
|
||||
private InterProcessSemaphoreV2 createSemapore(String topDomain){
|
||||
@@ -81,7 +96,7 @@ public class ZookeeperDomainCoordinator implements DomainCoordinator {
|
||||
* after this method returns true)
|
||||
*/
|
||||
public boolean isLockableHint(EdgeDomain domain) {
|
||||
return true; // Curator does not provide a way to check if a lock is available without acquiring it
|
||||
return !waitCounts.containsKey(domain.topDomain.toLowerCase());
|
||||
}
|
||||
|
||||
public static class ZkDomainLock implements DomainLock {
|
||||
|
@@ -19,6 +19,8 @@ import nu.marginalia.model.crawldata.CrawlerDocumentStatus;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.slf4j.Marker;
|
||||
import org.slf4j.MarkerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
@@ -37,6 +39,7 @@ public class DocumentProcessor {
|
||||
"text/plain",
|
||||
"application/pdf");
|
||||
|
||||
private final Marker converterAuditMarker = MarkerFactory.getMarker("CONVERTER");
|
||||
|
||||
private final List<AbstractDocumentProcessorPlugin> processorPlugins = new ArrayList<>();
|
||||
private final AnchorTextKeywords anchorTextKeywords;
|
||||
@@ -81,12 +84,13 @@ public class DocumentProcessor {
|
||||
catch (DisqualifiedException ex) {
|
||||
ret.state = UrlIndexingState.DISQUALIFIED;
|
||||
ret.stateReason = ex.reason.toString();
|
||||
logger.debug("Disqualified {}: {}", ret.url, ex.reason);
|
||||
logger.info(converterAuditMarker, "Disqualified {}: {}", ret.url, ex.reason);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
ret.state = UrlIndexingState.DISQUALIFIED;
|
||||
ret.stateReason = DisqualifiedException.DisqualificationReason.PROCESSING_EXCEPTION.toString();
|
||||
logger.info("Failed to convert " + crawledDocument.url, ex);
|
||||
logger.info(converterAuditMarker, "Failed to convert {}: {}", crawledDocument.url, ex.getClass().getSimpleName());
|
||||
logger.warn(converterAuditMarker, "Failed to convert " + crawledDocument.url, ex);
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
@@ -3,7 +3,6 @@ package nu.marginalia.converting.processor.logic;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.converting.model.DisqualifiedException;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
|
||||
@Singleton
|
||||
@@ -26,12 +25,9 @@ public class DocumentLengthLogic {
|
||||
return (int) Math.round((totalWords / (double) numSentences) / 4.);
|
||||
}
|
||||
|
||||
public void validateLength(DocumentLanguageData dld,
|
||||
double modifier) throws DisqualifiedException
|
||||
public boolean validateLength(DocumentLanguageData dld, double modifier)
|
||||
{
|
||||
if (modifier * dld.totalNumWords() < minDocumentLength) {
|
||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
|
||||
}
|
||||
return modifier * dld.totalNumWords() >= minDocumentLength;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -68,6 +68,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
private final HtmlProcessorSpecializations htmlProcessorSpecializations;
|
||||
|
||||
private static final int MAX_DOCUMENT_LENGTH_BYTES = Integer.getInteger("converter.max-body-length",128_000);
|
||||
private static boolean lenientProcessing = Boolean.getBoolean("converter.lenientProcessing");
|
||||
|
||||
@Inject
|
||||
public HtmlDocumentProcessorPlugin(
|
||||
@@ -108,13 +109,13 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
DocumentClass documentClass)
|
||||
throws DisqualifiedException, URISyntaxException, IOException {
|
||||
|
||||
if (languageFilter.isBlockedUnicodeRange(crawledDocument.documentBody(512))) {
|
||||
if (!lenientProcessing && languageFilter.isBlockedUnicodeRange(crawledDocument.documentBody(512))) {
|
||||
throw new DisqualifiedException(DisqualificationReason.LANGUAGE);
|
||||
}
|
||||
|
||||
Document doc = crawledDocument.parseBody();
|
||||
|
||||
if (AcceptableAds.hasAcceptableAdsTag(doc)) {
|
||||
if (!lenientProcessing && AcceptableAds.hasAcceptableAdsTag(doc)) {
|
||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.ACCEPTABLE_ADS);
|
||||
}
|
||||
|
||||
@@ -129,25 +130,27 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
final var specialization = htmlProcessorSpecializations.select(generatorParts, url);
|
||||
|
||||
if (!specialization.shouldIndex(url)) {
|
||||
if (!lenientProcessing && !specialization.shouldIndex(url)) {
|
||||
throw new DisqualifiedException(DisqualificationReason.IRRELEVANT);
|
||||
}
|
||||
|
||||
var prunedDoc = specialization.prune(doc);
|
||||
|
||||
|
||||
final int length = getLength(doc);
|
||||
final DocumentFormat format = getDocumentFormat(doc);
|
||||
final double quality = documentValuator.getQuality(crawledDocument, format, doc, length);
|
||||
|
||||
if (isDisqualified(documentClass, url, quality, doc.title())) {
|
||||
if (!lenientProcessing && isDisqualified(documentClass, url, quality, doc.title())) {
|
||||
throw new DisqualifiedException(DisqualificationReason.QUALITY);
|
||||
}
|
||||
|
||||
DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(prunedDoc);
|
||||
|
||||
checkDocumentLanguage(dld);
|
||||
documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier());
|
||||
|
||||
if (!lenientProcessing && !documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier())) {
|
||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
|
||||
}
|
||||
|
||||
var ret = new ProcessedDocumentDetails();
|
||||
|
||||
|
@@ -43,6 +43,7 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
private final DefaultSpecialization defaultSpecialization;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(PdfDocumentProcessorPlugin.class);
|
||||
private static boolean lenientProcessing = Boolean.getBoolean("converter.lenientProcessing");
|
||||
|
||||
@Inject
|
||||
public PdfDocumentProcessorPlugin(@Named("max-title-length") Integer maxTitleLength,
|
||||
@@ -81,7 +82,7 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
String documentBody = crawledDocument.documentBody();
|
||||
|
||||
if (languageFilter.isBlockedUnicodeRange(documentBody)) {
|
||||
if (!lenientProcessing && languageFilter.isBlockedUnicodeRange(documentBody)) {
|
||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE);
|
||||
}
|
||||
|
||||
@@ -100,7 +101,9 @@ public class PdfDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
checkDocumentLanguage(dld);
|
||||
|
||||
documentLengthLogic.validateLength(dld, 1.0);
|
||||
if (!lenientProcessing && !documentLengthLogic.validateLength(dld, 1.0)) {
|
||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
|
||||
}
|
||||
|
||||
var ret = new ProcessedDocumentDetails();
|
||||
|
||||
|
@@ -37,6 +37,8 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
|
||||
private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
|
||||
private final DocumentLengthLogic documentLengthLogic;
|
||||
|
||||
private static boolean lenientProcessing = Boolean.getBoolean("converter.lenientProcessing");
|
||||
|
||||
|
||||
@Inject
|
||||
public PlainTextDocumentProcessorPlugin(@Named("max-title-length") Integer maxTitleLength,
|
||||
@@ -73,7 +75,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
|
||||
|
||||
String documentBody = crawledDocument.documentBody();
|
||||
|
||||
if (languageFilter.isBlockedUnicodeRange(documentBody)) {
|
||||
if (!lenientProcessing && languageFilter.isBlockedUnicodeRange(documentBody)) {
|
||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE);
|
||||
}
|
||||
|
||||
@@ -83,7 +85,9 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
|
||||
|
||||
checkDocumentLanguage(dld);
|
||||
|
||||
documentLengthLogic.validateLength(dld, 1.0);
|
||||
if (!lenientProcessing && !documentLengthLogic.validateLength(dld, 1.0)) {
|
||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
|
||||
}
|
||||
|
||||
var ret = new ProcessedDocumentDetails();
|
||||
|
||||
|
@@ -36,7 +36,6 @@ import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
||||
import org.apache.hc.core5.http.message.MessageSupport;
|
||||
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||
import org.apache.hc.core5.pool.PoolStats;
|
||||
import org.apache.hc.core5.ssl.SSLContextBuilder;
|
||||
import org.apache.hc.core5.util.TimeValue;
|
||||
import org.apache.hc.core5.util.Timeout;
|
||||
import org.jsoup.Jsoup;
|
||||
@@ -49,15 +48,12 @@ import org.slf4j.MarkerFactory;
|
||||
|
||||
import javax.net.ssl.SSLContext;
|
||||
import javax.net.ssl.SSLException;
|
||||
import javax.net.ssl.TrustManager;
|
||||
import javax.net.ssl.X509TrustManager;
|
||||
import java.io.IOException;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.UnknownHostException;
|
||||
import java.security.KeyManagementException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.security.cert.X509Certificate;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.*;
|
||||
@@ -99,42 +95,12 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
||||
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
|
||||
.build();
|
||||
|
||||
// No-op up front validation of server certificates.
|
||||
//
|
||||
// We will validate certificates later, after the connection is established
|
||||
// as we want to store the certificate chain and validation
|
||||
// outcome to the database.
|
||||
|
||||
var trustMeBro = new X509TrustManager() {
|
||||
private X509Certificate[] lastServerCertChain;
|
||||
|
||||
@Override
|
||||
public void checkClientTrusted(X509Certificate[] chain, String authType) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void checkServerTrusted(X509Certificate[] chain, String authType) {
|
||||
this.lastServerCertChain = chain.clone();
|
||||
}
|
||||
|
||||
@Override
|
||||
public X509Certificate[] getAcceptedIssuers() {
|
||||
return new X509Certificate[0];
|
||||
}
|
||||
|
||||
public X509Certificate[] getLastServerCertChain() {
|
||||
return lastServerCertChain != null ? lastServerCertChain.clone() : null;
|
||||
}
|
||||
};
|
||||
|
||||
SSLContext sslContext = SSLContextBuilder.create().build();
|
||||
sslContext.init(null, new TrustManager[]{trustMeBro}, null);
|
||||
|
||||
connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
|
||||
.setMaxConnPerRoute(2)
|
||||
.setMaxConnTotal(5000)
|
||||
.setDefaultConnectionConfig(connectionConfig)
|
||||
.setTlsSocketStrategy(new DefaultClientTlsStrategy(sslContext))
|
||||
.setTlsSocketStrategy(new DefaultClientTlsStrategy(SSLContext.getDefault()))
|
||||
.build();
|
||||
|
||||
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
|
||||
|
@@ -115,9 +115,13 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(probedUrl.domain, warcRecorder);
|
||||
final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
|
||||
|
||||
if (!robotsRules.isAllowed(probedUrl.toString())) {
|
||||
warcRecorder.flagAsRobotsTxtError(probedUrl);
|
||||
yield 1; // Nothing we can do here, we aren't allowed to fetch the root URL
|
||||
}
|
||||
delayTimer.waitFetchDelay(0); // initial delay after robots.txt
|
||||
|
||||
DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, delayTimer);
|
||||
DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, robotsRules, delayTimer);
|
||||
domainStateDb.save(summaryRecord);
|
||||
|
||||
if (Thread.interrupted()) {
|
||||
@@ -270,11 +274,11 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
|
||||
|
||||
|
||||
private DomainStateDb.SummaryRecord sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
|
||||
private DomainStateDb.SummaryRecord sniffRootDocument(EdgeUrl rootUrl, SimpleRobotRules robotsRules, CrawlDelayTimer timer) {
|
||||
Optional<String> feedLink = Optional.empty();
|
||||
|
||||
try {
|
||||
var url = rootUrl.withPathAndParam("/", null);
|
||||
EdgeUrl url = rootUrl.withPathAndParam("/", null);
|
||||
|
||||
HttpFetchResult result = fetcher.fetchContent(url, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
|
||||
timer.waitFetchDelay(0);
|
||||
@@ -331,7 +335,7 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
|
||||
|
||||
if (feedLink.isEmpty()) {
|
||||
feedLink = guessFeedUrl(timer);
|
||||
feedLink = guessFeedUrl(timer, robotsRules);
|
||||
}
|
||||
|
||||
// Download the sitemap if available
|
||||
@@ -339,14 +343,18 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
|
||||
// Grab the favicon if it exists
|
||||
|
||||
if (fetcher.fetchContent(faviconUrl, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED) instanceof HttpFetchResult.ResultOk iconResult) {
|
||||
String contentType = iconResult.header("Content-Type");
|
||||
byte[] iconData = iconResult.getBodyBytes();
|
||||
if (robotsRules.isAllowed(faviconUrl.toString())) {
|
||||
if (fetcher.fetchContent(faviconUrl, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED)
|
||||
instanceof HttpFetchResult.ResultOk iconResult)
|
||||
{
|
||||
String contentType = iconResult.header("Content-Type");
|
||||
byte[] iconData = iconResult.getBodyBytes();
|
||||
|
||||
domainStateDb.saveIcon(
|
||||
domain,
|
||||
new DomainStateDb.FaviconRecord(contentType, iconData)
|
||||
);
|
||||
domainStateDb.saveIcon(
|
||||
domain,
|
||||
new DomainStateDb.FaviconRecord(contentType, iconData)
|
||||
);
|
||||
}
|
||||
}
|
||||
timer.waitFetchDelay(0);
|
||||
|
||||
@@ -383,7 +391,7 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
"blog/rss"
|
||||
);
|
||||
|
||||
private Optional<String> guessFeedUrl(CrawlDelayTimer timer) throws InterruptedException {
|
||||
private Optional<String> guessFeedUrl(CrawlDelayTimer timer, SimpleRobotRules robotsRules) throws InterruptedException {
|
||||
var oldDomainStateRecord = domainStateDb.getSummary(domain);
|
||||
|
||||
// If we are already aware of an old feed URL, then we can just revalidate it
|
||||
@@ -396,6 +404,9 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
|
||||
for (String endpoint : likelyFeedEndpoints) {
|
||||
String url = "https://" + domain + "/" + endpoint;
|
||||
if (!robotsRules.isAllowed(url)) {
|
||||
continue;
|
||||
}
|
||||
if (validateFeedUrl(url, timer)) {
|
||||
return Optional.of(url);
|
||||
}
|
||||
|
@@ -28,6 +28,8 @@ public final class CrawledDocument implements SerializableCrawlData {
|
||||
@Nullable
|
||||
public String headers;
|
||||
|
||||
private static int MAX_LENGTH_BYTES = 500_000;
|
||||
|
||||
public String documentBody() {
|
||||
return DocumentBodyToString.getStringData(
|
||||
ContentType.parse(contentType),
|
||||
@@ -65,7 +67,7 @@ public final class CrawledDocument implements SerializableCrawlData {
|
||||
return DocumentBodyToString.getParsedData(
|
||||
ContentType.parse(contentType),
|
||||
documentBodyBytes,
|
||||
200_000,
|
||||
MAX_LENGTH_BYTES,
|
||||
url);
|
||||
}
|
||||
|
||||
|
@@ -50,6 +50,7 @@ dependencies {
|
||||
|
||||
implementation libs.notnull
|
||||
implementation libs.guava
|
||||
implementation libs.httpclient
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
|
@@ -15,6 +15,7 @@ import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.db.DomainBlacklist;
|
||||
import nu.marginalia.io.SerializableCrawlDataStream;
|
||||
import nu.marginalia.livecrawler.io.HttpClientProvider;
|
||||
import nu.marginalia.loading.LoaderInputData;
|
||||
import nu.marginalia.loading.documents.DocumentLoaderService;
|
||||
import nu.marginalia.loading.documents.KeywordLoaderService;
|
||||
@@ -32,12 +33,15 @@ import nu.marginalia.service.module.ServiceDiscoveryModule;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageBaseType;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
|
||||
import org.apache.hc.core5.io.CloseMode;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.security.Security;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.time.temporal.ChronoUnit;
|
||||
import java.util.HashMap;
|
||||
@@ -74,7 +78,9 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
DomainProcessor domainProcessor,
|
||||
FileStorageService fileStorageService,
|
||||
KeywordLoaderService keywordLoaderService,
|
||||
DocumentLoaderService documentLoaderService, DomainCoordinator domainCoordinator, HikariDataSource dataSource)
|
||||
DocumentLoaderService documentLoaderService,
|
||||
DomainCoordinator domainCoordinator,
|
||||
HikariDataSource dataSource)
|
||||
throws Exception
|
||||
{
|
||||
super(messageQueueFactory, config, gson, LIVE_CRAWLER_INBOX);
|
||||
@@ -148,7 +154,10 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
}
|
||||
|
||||
private void run() throws Exception {
|
||||
Path basePath = fileStorageService.getStorageBase(FileStorageBaseType.STORAGE).asPath().resolve("live-crawl-data");
|
||||
Path basePath = fileStorageService
|
||||
.getStorageBase(FileStorageBaseType.STORAGE)
|
||||
.asPath()
|
||||
.resolve("live-crawl-data");
|
||||
|
||||
if (!Files.isDirectory(basePath)) {
|
||||
Files.createDirectories(basePath);
|
||||
@@ -163,21 +172,38 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
{
|
||||
final Instant cutoff = Instant.now().minus(60, ChronoUnit.DAYS);
|
||||
|
||||
/* ------------------------------------------------ */
|
||||
/* Fetch the latest domains from the feeds database */
|
||||
/* ------------------------------------------------ */
|
||||
|
||||
processHeartbeat.progress(LiveCrawlState.FETCH_LINKS);
|
||||
|
||||
Map<String, List<String>> urlsPerDomain = new HashMap<>(10_000);
|
||||
if (!feedsClient.waitReady(Duration.ofHours(1))) {
|
||||
throw new RuntimeException("Feeds client never became ready, cannot proceed with live crawling");
|
||||
}
|
||||
feedsClient.getUpdatedDomains(cutoff, urlsPerDomain::put);
|
||||
|
||||
logger.info("Fetched data for {} domains", urlsPerDomain.size());
|
||||
|
||||
|
||||
/* ------------------------------------- */
|
||||
/* Prune the database from old entries */
|
||||
/* ------------------------------------- */
|
||||
|
||||
processHeartbeat.progress(LiveCrawlState.PRUNE_DB);
|
||||
|
||||
// Remove data that is too old
|
||||
dataSet.prune(cutoff);
|
||||
|
||||
|
||||
/* ------------------------------------- */
|
||||
/* Fetch the links for each domain */
|
||||
/* ------------------------------------- */
|
||||
|
||||
processHeartbeat.progress(LiveCrawlState.CRAWLING);
|
||||
|
||||
try (SimpleLinkScraper fetcher = new SimpleLinkScraper(dataSet, domainCoordinator, domainQueries, domainBlacklist);
|
||||
CloseableHttpClient client = HttpClientProvider.createClient();
|
||||
try (SimpleLinkScraper fetcher = new SimpleLinkScraper(dataSet, domainCoordinator, domainQueries, client, domainBlacklist);
|
||||
var hb = heartbeat.createAdHocTaskHeartbeat("Live Crawling"))
|
||||
{
|
||||
for (Map.Entry<String, List<String>> entry : hb.wrap("Fetching", urlsPerDomain.entrySet())) {
|
||||
@@ -190,18 +216,29 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
fetcher.scheduleRetrieval(domain, urls);
|
||||
}
|
||||
}
|
||||
finally {
|
||||
client.close(CloseMode.GRACEFUL);
|
||||
}
|
||||
|
||||
Path tempPath = dataSet.createWorkDir();
|
||||
|
||||
|
||||
try {
|
||||
/* ------------------------------------- */
|
||||
/* Process the fetched links */
|
||||
/* ------------------------------------- */
|
||||
|
||||
processHeartbeat.progress(LiveCrawlState.PROCESSING);
|
||||
|
||||
try (var hb = heartbeat.createAdHocTaskHeartbeat("Processing");
|
||||
var writer = new ConverterBatchWriter(tempPath, 0)
|
||||
) {
|
||||
// Offset the documents' ordinals toward the upper range, to avoid an ID collisions with the
|
||||
// main indexes (the maximum permissible for doc ordinal is value is 67_108_863, so this
|
||||
// leaves us with a lot of headroom still)
|
||||
// We need unique document ids that do not collide with the document id from the main index,
|
||||
// so we offset the documents' ordinals toward the upper range.
|
||||
//
|
||||
// The maximum permissible for doc ordinal is value is 67_108_863,
|
||||
// so this leaves us with a lot of headroom still!
|
||||
// Expected document count here is order of 10 :^)
|
||||
writer.setOrdinalOffset(67_000_000);
|
||||
|
||||
for (SerializableCrawlDataStream stream : hb.wrap("Processing", dataSet.getDataStreams())) {
|
||||
@@ -209,10 +246,15 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* ---------------------------------------------- */
|
||||
/* Load the processed data into the link database */
|
||||
/* and construct an index journal for the docs */
|
||||
/* ---------------------------------------------- */
|
||||
|
||||
processHeartbeat.progress(LiveCrawlState.LOADING);
|
||||
|
||||
LoaderInputData lid = new LoaderInputData(tempPath, 1);
|
||||
|
||||
DomainIdRegistry domainIdRegistry = new DbDomainIdRegistry(dataSource);
|
||||
|
||||
keywordLoaderService.loadKeywords(domainIdRegistry, heartbeat, lid);
|
||||
@@ -224,9 +266,16 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
FileUtils.deleteDirectory(tempPath.toFile());
|
||||
}
|
||||
|
||||
// Construct the index
|
||||
|
||||
/* ------------------------------------- */
|
||||
/* Finish up */
|
||||
/* ------------------------------------- */
|
||||
|
||||
processHeartbeat.progress(LiveCrawlState.DONE);
|
||||
|
||||
// After we return from here, the LiveCrawlActor will trigger an index construction
|
||||
// job. Unlike all the stuff we did in this process, it's identical to the real job
|
||||
// so we don't need to do anything special from this process
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -7,7 +7,6 @@ import nu.marginalia.contenttype.ContentType;
|
||||
import nu.marginalia.contenttype.DocumentBodyToString;
|
||||
import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.coordination.DomainLock;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.db.DomainBlacklist;
|
||||
@@ -15,24 +14,21 @@ import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.core5.http.ClassicHttpRequest;
|
||||
import org.apache.hc.core5.http.io.entity.EntityUtils;
|
||||
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpHeaders;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.ThreadLocalRandom;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
/** A simple link scraper that fetches URLs and stores them in a database,
|
||||
* with no concept of a crawl frontier, WARC output, or other advanced features
|
||||
@@ -45,20 +41,21 @@ public class SimpleLinkScraper implements AutoCloseable {
|
||||
private final LiveCrawlDataSet dataSet;
|
||||
private final DbDomainQueries domainQueries;
|
||||
private final DomainBlacklist domainBlacklist;
|
||||
private final Duration connectTimeout = Duration.ofSeconds(10);
|
||||
private final Duration readTimeout = Duration.ofSeconds(10);
|
||||
private final DomainCoordinator domainCoordinator;
|
||||
|
||||
private final static int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);
|
||||
private final HttpClient httpClient;
|
||||
|
||||
public SimpleLinkScraper(LiveCrawlDataSet dataSet,
|
||||
DomainCoordinator domainCoordinator,
|
||||
DbDomainQueries domainQueries,
|
||||
HttpClient httpClient,
|
||||
DomainBlacklist domainBlacklist) {
|
||||
this.dataSet = dataSet;
|
||||
this.domainCoordinator = domainCoordinator;
|
||||
this.domainQueries = domainQueries;
|
||||
this.domainBlacklist = domainBlacklist;
|
||||
this.httpClient = httpClient;
|
||||
}
|
||||
|
||||
public void scheduleRetrieval(EdgeDomain domain, List<String> urls) {
|
||||
@@ -75,17 +72,19 @@ public class SimpleLinkScraper implements AutoCloseable {
|
||||
|
||||
EdgeUrl rootUrl = domain.toRootUrlHttps();
|
||||
|
||||
List<EdgeUrl> relevantUrls = new ArrayList<>();
|
||||
List<EdgeUrl> relevantUrls = new ArrayList<>(Math.max(1, urls.size()));
|
||||
|
||||
// Resolve absolute URLs
|
||||
for (var url : urls) {
|
||||
Optional<EdgeUrl> optParsedUrl = lp.parseLink(rootUrl, url);
|
||||
if (optParsedUrl.isEmpty()) {
|
||||
|
||||
if (optParsedUrl.isEmpty())
|
||||
continue;
|
||||
}
|
||||
if (dataSet.hasUrl(optParsedUrl.get())) {
|
||||
continue;
|
||||
}
|
||||
relevantUrls.add(optParsedUrl.get());
|
||||
|
||||
EdgeUrl absoluteUrl = optParsedUrl.get();
|
||||
|
||||
if (!dataSet.hasUrl(absoluteUrl))
|
||||
relevantUrls.add(absoluteUrl);
|
||||
}
|
||||
|
||||
if (relevantUrls.isEmpty()) {
|
||||
@@ -94,16 +93,10 @@ public class SimpleLinkScraper implements AutoCloseable {
|
||||
|
||||
int fetched = 0;
|
||||
|
||||
try (HttpClient client = HttpClient
|
||||
.newBuilder()
|
||||
.connectTimeout(connectTimeout)
|
||||
.followRedirects(HttpClient.Redirect.NEVER)
|
||||
.version(HttpClient.Version.HTTP_2)
|
||||
.build();
|
||||
// throttle concurrent access per domain; IDE will complain it's not used, but it holds a semaphore -- do not remove:
|
||||
try (// throttle concurrent access per domain; IDE will complain it's not used, but it holds a semaphore -- do not remove:
|
||||
DomainLock lock = domainCoordinator.lockDomain(domain)
|
||||
) {
|
||||
SimpleRobotRules rules = fetchRobotsRules(rootUrl, client);
|
||||
SimpleRobotRules rules = fetchRobotsRules(rootUrl);
|
||||
|
||||
if (rules == null) { // I/O error fetching robots.txt
|
||||
// If we can't fetch the robots.txt,
|
||||
@@ -116,18 +109,19 @@ public class SimpleLinkScraper implements AutoCloseable {
|
||||
CrawlDelayTimer timer = new CrawlDelayTimer(rules.getCrawlDelay());
|
||||
|
||||
for (var parsedUrl : relevantUrls) {
|
||||
|
||||
if (!rules.isAllowed(parsedUrl.toString())) {
|
||||
maybeFlagAsBad(parsedUrl);
|
||||
continue;
|
||||
}
|
||||
|
||||
switch (fetchUrl(domainId, parsedUrl, timer, client)) {
|
||||
switch (fetchUrl(domainId, parsedUrl, timer)) {
|
||||
case FetchResult.Success(int id, EdgeUrl docUrl, String body, String headers) -> {
|
||||
dataSet.saveDocument(id, docUrl, body, headers, "");
|
||||
fetched++;
|
||||
}
|
||||
case FetchResult.Error(EdgeUrl docUrl) -> maybeFlagAsBad(docUrl);
|
||||
case FetchResult.Error(EdgeUrl docUrl) -> {
|
||||
maybeFlagAsBad(docUrl);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -150,111 +144,107 @@ public class SimpleLinkScraper implements AutoCloseable {
|
||||
}
|
||||
|
||||
@Nullable
|
||||
private SimpleRobotRules fetchRobotsRules(EdgeUrl rootUrl, HttpClient client) throws IOException, InterruptedException, URISyntaxException {
|
||||
var robotsRequest = HttpRequest.newBuilder(rootUrl.withPathAndParam("/robots.txt", null).asURI())
|
||||
.GET()
|
||||
.header("User-Agent", WmsaHome.getUserAgent().uaString())
|
||||
.header("Accept-Encoding","gzip")
|
||||
.timeout(readTimeout);
|
||||
|
||||
// Fetch the robots.txt
|
||||
private SimpleRobotRules fetchRobotsRules(EdgeUrl rootUrl) throws URISyntaxException {
|
||||
ClassicHttpRequest request = ClassicRequestBuilder.get(rootUrl.withPathAndParam("/robots.txt", null).asURI())
|
||||
.setHeader("User-Agent", WmsaHome.getUserAgent().uaString())
|
||||
.setHeader("Accept-Encoding", "gzip")
|
||||
.build();
|
||||
|
||||
try {
|
||||
SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
|
||||
HttpResponse<byte[]> robotsTxt = client.send(robotsRequest.build(), HttpResponse.BodyHandlers.ofByteArray());
|
||||
|
||||
if (robotsTxt.statusCode() == 200) {
|
||||
return parser.parseContent(rootUrl.toString(),
|
||||
getResponseData(robotsTxt),
|
||||
robotsTxt.headers().firstValue("Content-Type").orElse("text/plain"),
|
||||
WmsaHome.getUserAgent().uaIdentifier());
|
||||
return httpClient.execute(request, rsp -> {
|
||||
if (rsp.getEntity() == null) {
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
if (rsp.getCode() == 200) {
|
||||
var contentTypeHeader = rsp.getFirstHeader("Content-Type");
|
||||
if (contentTypeHeader == null) {
|
||||
return null; // No content type header, can't parse
|
||||
}
|
||||
return new SimpleRobotRulesParser().parseContent(
|
||||
rootUrl.toString(),
|
||||
EntityUtils.toByteArray(rsp.getEntity()),
|
||||
contentTypeHeader.getValue(),
|
||||
WmsaHome.getUserAgent().uaIdentifier()
|
||||
);
|
||||
} else if (rsp.getCode() == 404) {
|
||||
return new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL);
|
||||
}
|
||||
} finally {
|
||||
EntityUtils.consumeQuietly(rsp.getEntity());
|
||||
}
|
||||
return null;
|
||||
});
|
||||
}
|
||||
catch (IOException e) {
|
||||
logger.error("Error fetching robots.txt for {}: {}", rootUrl, e.getMessage());
|
||||
return null; // I/O error fetching robots.txt
|
||||
}
|
||||
finally {
|
||||
try {
|
||||
TimeUnit.SECONDS.sleep(1);
|
||||
}
|
||||
else if (robotsTxt.statusCode() == 404) {
|
||||
return new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL);
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.error("Error fetching robots.txt for {}: {} {}", rootUrl, ex.getClass().getSimpleName(), ex.getMessage());
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Fetch a URL and store it in the database
|
||||
*/
|
||||
private FetchResult fetchUrl(int domainId, EdgeUrl parsedUrl, CrawlDelayTimer timer, HttpClient client) throws Exception {
|
||||
private FetchResult fetchUrl(int domainId, EdgeUrl parsedUrl, CrawlDelayTimer timer) throws Exception {
|
||||
|
||||
timer.waitFetchDelay();
|
||||
|
||||
HttpRequest request = HttpRequest.newBuilder(parsedUrl.asURI())
|
||||
.GET()
|
||||
.header("User-Agent", WmsaHome.getUserAgent().uaString())
|
||||
.header("Accept", "text/html")
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.timeout(readTimeout)
|
||||
ClassicHttpRequest request = ClassicRequestBuilder.get(parsedUrl.asURI())
|
||||
.setHeader("User-Agent", WmsaHome.getUserAgent().uaString())
|
||||
.setHeader("Accept", "text/html")
|
||||
.setHeader("Accept-Encoding", "gzip")
|
||||
.build();
|
||||
|
||||
try {
|
||||
HttpResponse<byte[]> response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
|
||||
return httpClient.execute(request, rsp -> {
|
||||
try {
|
||||
if (rsp.getCode() == 200) {
|
||||
String contentType = rsp.getFirstHeader("Content-Type").getValue();
|
||||
if (!contentType.toLowerCase().startsWith("text/html")) {
|
||||
return new FetchResult.Error(parsedUrl);
|
||||
}
|
||||
|
||||
// Handle rate limiting by waiting and retrying once
|
||||
if (response.statusCode() == 429) {
|
||||
timer.waitRetryDelay(new HttpFetcherImpl.RateLimitException(
|
||||
response.headers().firstValue("Retry-After").orElse("5")
|
||||
));
|
||||
response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
|
||||
}
|
||||
byte[] body = EntityUtils.toByteArray(rsp.getEntity(), MAX_SIZE);
|
||||
|
||||
String contentType = response.headers().firstValue("Content-Type").orElse("").toLowerCase();
|
||||
String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), body);
|
||||
|
||||
if (response.statusCode() == 200) {
|
||||
if (!contentType.toLowerCase().startsWith("text/html")) {
|
||||
return new FetchResult.Error(parsedUrl);
|
||||
StringBuilder headersStr = new StringBuilder();
|
||||
for (var header : rsp.getHeaders()) {
|
||||
headersStr.append(header.getName()).append(": ").append(header.getValue()).append("\n");
|
||||
}
|
||||
|
||||
return new FetchResult.Success(domainId, parsedUrl, bodyText, headersStr.toString());
|
||||
}
|
||||
} finally {
|
||||
if (rsp.getEntity() != null) {
|
||||
EntityUtils.consumeQuietly(rsp.getEntity());
|
||||
}
|
||||
}
|
||||
|
||||
byte[] body = getResponseData(response);
|
||||
if (body.length > MAX_SIZE) {
|
||||
return new FetchResult.Error(parsedUrl);
|
||||
}
|
||||
|
||||
String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), body);
|
||||
|
||||
return new FetchResult.Success(domainId, parsedUrl, bodyText, headersToString(response.headers()));
|
||||
}
|
||||
return new FetchResult.Error(parsedUrl);
|
||||
});
|
||||
}
|
||||
catch (IOException ex) {
|
||||
// We don't want a full stack trace on every error, as it's quite common and very noisy
|
||||
logger.error("Error fetching URL {}: {} {}", parsedUrl, ex.getClass().getSimpleName(), ex.getMessage());
|
||||
catch (IOException e) {
|
||||
logger.error("Error fetching {}: {}", parsedUrl, e.getMessage());
|
||||
// If we can't fetch the URL, we return an error result
|
||||
// so that the caller can decide what to do with it.
|
||||
}
|
||||
finally {
|
||||
timer.waitFetchDelay();
|
||||
}
|
||||
|
||||
return new FetchResult.Error(parsedUrl);
|
||||
}
|
||||
|
||||
private byte[] getResponseData(HttpResponse<byte[]> response) throws IOException {
|
||||
String encoding = response.headers().firstValue("Content-Encoding").orElse("");
|
||||
|
||||
if ("gzip".equals(encoding)) {
|
||||
try (var stream = new GZIPInputStream(new ByteArrayInputStream(response.body()))) {
|
||||
return stream.readAllBytes();
|
||||
}
|
||||
}
|
||||
else {
|
||||
return response.body();
|
||||
}
|
||||
}
|
||||
|
||||
sealed interface FetchResult {
|
||||
record Success(int domainId, EdgeUrl url, String body, String headers) implements FetchResult {}
|
||||
record Error(EdgeUrl url) implements FetchResult {}
|
||||
}
|
||||
|
||||
private String headersToString(HttpHeaders headers) {
|
||||
StringBuilder headersStr = new StringBuilder();
|
||||
headers.map().forEach((k, v) -> {
|
||||
headersStr.append(k).append(": ").append(v).append("\n");
|
||||
});
|
||||
return headersStr.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws Exception {
|
||||
pool.shutDown();
|
||||
|
@@ -0,0 +1,126 @@
|
||||
package nu.marginalia.livecrawler.io;
|
||||
|
||||
import com.google.inject.Provider;
|
||||
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.config.ConnectionConfig;
|
||||
import org.apache.hc.client5.http.config.RequestConfig;
|
||||
import org.apache.hc.client5.http.cookie.StandardCookieSpec;
|
||||
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
|
||||
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager;
|
||||
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder;
|
||||
import org.apache.hc.core5.http.HeaderElement;
|
||||
import org.apache.hc.core5.http.HeaderElements;
|
||||
import org.apache.hc.core5.http.HttpResponse;
|
||||
import org.apache.hc.core5.http.io.SocketConfig;
|
||||
import org.apache.hc.core5.http.message.MessageSupport;
|
||||
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||
import org.apache.hc.core5.util.TimeValue;
|
||||
import org.apache.hc.core5.util.Timeout;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.security.KeyManagementException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.util.Iterator;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class HttpClientProvider implements Provider<HttpClient> {
|
||||
private static final HttpClient client;
|
||||
private static PoolingHttpClientConnectionManager connectionManager;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(HttpClientProvider.class);
|
||||
|
||||
static {
|
||||
try {
|
||||
client = createClient();
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public static CloseableHttpClient createClient() throws NoSuchAlgorithmException, KeyManagementException {
|
||||
final ConnectionConfig connectionConfig = ConnectionConfig.custom()
|
||||
.setSocketTimeout(15, TimeUnit.SECONDS)
|
||||
.setConnectTimeout(15, TimeUnit.SECONDS)
|
||||
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
|
||||
.build();
|
||||
|
||||
|
||||
connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
|
||||
.setMaxConnPerRoute(2)
|
||||
.setMaxConnTotal(50)
|
||||
.setDefaultConnectionConfig(connectionConfig)
|
||||
.build();
|
||||
|
||||
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
|
||||
.setSoLinger(TimeValue.ofSeconds(-1))
|
||||
.setSoTimeout(Timeout.ofSeconds(10))
|
||||
.build()
|
||||
);
|
||||
|
||||
Thread.ofPlatform().daemon(true).start(() -> {
|
||||
try {
|
||||
for (;;) {
|
||||
TimeUnit.SECONDS.sleep(15);
|
||||
logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
|
||||
}
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
});
|
||||
|
||||
final RequestConfig defaultRequestConfig = RequestConfig.custom()
|
||||
.setCookieSpec(StandardCookieSpec.IGNORE)
|
||||
.setResponseTimeout(10, TimeUnit.SECONDS)
|
||||
.setConnectionRequestTimeout(5, TimeUnit.MINUTES)
|
||||
.build();
|
||||
|
||||
return HttpClients.custom()
|
||||
.setConnectionManager(connectionManager)
|
||||
.setRetryStrategy(new RetryStrategy())
|
||||
.setKeepAliveStrategy(new ConnectionKeepAliveStrategy() {
|
||||
// Default keep-alive duration is 3 minutes, but this is too long for us,
|
||||
// as we are either going to re-use it fairly quickly or close it for a long time.
|
||||
//
|
||||
// So we set it to 30 seconds or clamp the server-provided value to a minimum of 10 seconds.
|
||||
private static final TimeValue defaultValue = TimeValue.ofSeconds(30);
|
||||
|
||||
@Override
|
||||
public TimeValue getKeepAliveDuration(HttpResponse response, HttpContext context) {
|
||||
final Iterator<HeaderElement> it = MessageSupport.iterate(response, HeaderElements.KEEP_ALIVE);
|
||||
|
||||
while (it.hasNext()) {
|
||||
final HeaderElement he = it.next();
|
||||
final String param = he.getName();
|
||||
final String value = he.getValue();
|
||||
|
||||
if (value == null)
|
||||
continue;
|
||||
if (!"timeout".equalsIgnoreCase(param))
|
||||
continue;
|
||||
|
||||
try {
|
||||
long timeout = Long.parseLong(value);
|
||||
timeout = Math.clamp(timeout, 30, defaultValue.toSeconds());
|
||||
return TimeValue.ofSeconds(timeout);
|
||||
} catch (final NumberFormatException ignore) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return defaultValue;
|
||||
}
|
||||
})
|
||||
.disableRedirectHandling()
|
||||
.setDefaultRequestConfig(defaultRequestConfig)
|
||||
.build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HttpClient get() {
|
||||
return client;
|
||||
}
|
||||
}
|
||||
|
@@ -0,0 +1,79 @@
|
||||
package nu.marginalia.livecrawler.io;
|
||||
|
||||
import org.apache.hc.client5.http.HttpHostConnectException;
|
||||
import org.apache.hc.client5.http.HttpRequestRetryStrategy;
|
||||
import org.apache.hc.core5.http.HttpRequest;
|
||||
import org.apache.hc.core5.http.HttpResponse;
|
||||
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||
import org.apache.hc.core5.util.TimeValue;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.net.ssl.SSLException;
|
||||
import java.io.IOException;
|
||||
import java.net.SocketException;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.net.UnknownHostException;
|
||||
|
||||
public class RetryStrategy implements HttpRequestRetryStrategy {
|
||||
private static final Logger logger = LoggerFactory.getLogger(RetryStrategy.class);
|
||||
|
||||
@Override
|
||||
public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
|
||||
return switch (exception) {
|
||||
case SocketTimeoutException ste -> false;
|
||||
case SSLException ssle -> false;
|
||||
case UnknownHostException uhe -> false;
|
||||
case HttpHostConnectException ex -> executionCount < 2;
|
||||
case SocketException ex -> executionCount < 2;
|
||||
default -> executionCount <= 3;
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean retryRequest(HttpResponse response, int executionCount, HttpContext context) {
|
||||
return switch (response.getCode()) {
|
||||
case 500, 503 -> executionCount <= 2;
|
||||
case 429 -> executionCount <= 3;
|
||||
default -> false;
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public TimeValue getRetryInterval(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
|
||||
return TimeValue.ofSeconds(1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TimeValue getRetryInterval(HttpResponse response, int executionCount, HttpContext context) {
|
||||
|
||||
int statusCode = response.getCode();
|
||||
|
||||
// Give 503 a bit more time
|
||||
if (statusCode == 503) return TimeValue.ofSeconds(5);
|
||||
|
||||
if (statusCode == 429) {
|
||||
// get the Retry-After header
|
||||
var retryAfterHeader = response.getFirstHeader("Retry-After");
|
||||
if (retryAfterHeader == null) {
|
||||
return TimeValue.ofSeconds(3);
|
||||
}
|
||||
|
||||
String retryAfter = retryAfterHeader.getValue();
|
||||
if (retryAfter == null) {
|
||||
return TimeValue.ofSeconds(2);
|
||||
}
|
||||
|
||||
try {
|
||||
int retryAfterTime = Integer.parseInt(retryAfter);
|
||||
retryAfterTime = Math.clamp(retryAfterTime, 1, 5);
|
||||
|
||||
return TimeValue.ofSeconds(retryAfterTime);
|
||||
} catch (NumberFormatException e) {
|
||||
logger.warn("Invalid Retry-After header: {}", retryAfter);
|
||||
}
|
||||
}
|
||||
|
||||
return TimeValue.ofSeconds(2);
|
||||
}
|
||||
}
|
@@ -3,10 +3,13 @@ package nu.marginalia.livecrawler;
|
||||
import nu.marginalia.coordination.LocalDomainCoordinator;
|
||||
import nu.marginalia.db.DomainBlacklistImpl;
|
||||
import nu.marginalia.io.SerializableCrawlDataStream;
|
||||
import nu.marginalia.livecrawler.io.HttpClientProvider;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
|
||||
import org.apache.hc.core5.io.CloseMode;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
@@ -16,29 +19,34 @@ import org.mockito.Mockito;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.security.KeyManagementException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.List;
|
||||
|
||||
class SimpleLinkScraperTest {
|
||||
private Path tempDir;
|
||||
private LiveCrawlDataSet dataSet;
|
||||
private CloseableHttpClient httpClient;
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws IOException, SQLException {
|
||||
public void setUp() throws IOException, SQLException, NoSuchAlgorithmException, KeyManagementException {
|
||||
tempDir = Files.createTempDirectory(getClass().getSimpleName());
|
||||
dataSet = new LiveCrawlDataSet(tempDir);
|
||||
httpClient = HttpClientProvider.createClient();
|
||||
}
|
||||
|
||||
|
||||
@AfterEach
|
||||
public void tearDown() throws Exception {
|
||||
dataSet.close();
|
||||
httpClient.close(CloseMode.IMMEDIATE);
|
||||
FileUtils.deleteDirectory(tempDir.toFile());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRetrieveNow() throws Exception {
|
||||
var scraper = new SimpleLinkScraper(dataSet, new LocalDomainCoordinator(), null, Mockito.mock(DomainBlacklistImpl.class));
|
||||
var scraper = new SimpleLinkScraper(dataSet, new LocalDomainCoordinator(), null, httpClient, Mockito.mock(DomainBlacklistImpl.class));
|
||||
int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
|
||||
Assertions.assertEquals(1, fetched);
|
||||
|
||||
@@ -58,7 +66,7 @@ class SimpleLinkScraperTest {
|
||||
@Test
|
||||
public void testRetrieveNow_Redundant() throws Exception {
|
||||
dataSet.saveDocument(1, new EdgeUrl("https://www.marginalia.nu/"), "<html>", "", "127.0.0.1");
|
||||
var scraper = new SimpleLinkScraper(dataSet, new LocalDomainCoordinator(),null, Mockito.mock(DomainBlacklistImpl.class));
|
||||
var scraper = new SimpleLinkScraper(dataSet, new LocalDomainCoordinator(),null, httpClient, Mockito.mock(DomainBlacklistImpl.class));
|
||||
|
||||
// If the requested URL is already in the dataSet, we retrieveNow should shortcircuit and not fetch anything
|
||||
int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
|
||||
|
@@ -40,6 +40,8 @@ public class LoaderMain extends ProcessMainClass {
|
||||
private final KeywordLoaderService keywordLoaderService;
|
||||
private final DocumentLoaderService documentLoaderService;
|
||||
|
||||
private static boolean insertFoundDomains = Boolean.getBoolean("loader.insertFoundDomains");
|
||||
|
||||
public static void main(String... args) {
|
||||
try {
|
||||
new org.mariadb.jdbc.Driver();
|
||||
@@ -99,14 +101,29 @@ public class LoaderMain extends ProcessMainClass {
|
||||
|
||||
try {
|
||||
var results = ForkJoinPool.commonPool()
|
||||
.invokeAll(
|
||||
List.of(
|
||||
() -> linksService.loadLinks(domainIdRegistry, heartbeat, inputData),
|
||||
() -> keywordLoaderService.loadKeywords(domainIdRegistry, heartbeat, inputData),
|
||||
() -> documentLoaderService.loadDocuments(domainIdRegistry, heartbeat, inputData),
|
||||
() -> domainService.loadDomainMetadata(domainIdRegistry, heartbeat, inputData)
|
||||
)
|
||||
);
|
||||
.invokeAll(List.of());
|
||||
|
||||
if ( true == insertFoundDomains ) {
|
||||
results = ForkJoinPool.commonPool()
|
||||
.invokeAll(
|
||||
List.of(
|
||||
() -> linksService.loadLinks(domainIdRegistry, heartbeat, inputData),
|
||||
() -> keywordLoaderService.loadKeywords(domainIdRegistry, heartbeat, inputData),
|
||||
() -> documentLoaderService.loadDocuments(domainIdRegistry, heartbeat, inputData),
|
||||
() -> domainService.loadDomainMetadata(domainIdRegistry, heartbeat, inputData)
|
||||
)
|
||||
);
|
||||
}
|
||||
else {
|
||||
results = ForkJoinPool.commonPool()
|
||||
.invokeAll(
|
||||
List.of(
|
||||
() -> keywordLoaderService.loadKeywords(domainIdRegistry, heartbeat, inputData),
|
||||
() -> documentLoaderService.loadDocuments(domainIdRegistry, heartbeat, inputData),
|
||||
() -> domainService.loadDomainMetadata(domainIdRegistry, heartbeat, inputData)
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
for (var result : results) {
|
||||
if (result.state() == Future.State.FAILED) {
|
||||
|
@@ -25,6 +25,8 @@ import java.util.Set;
|
||||
@Singleton
|
||||
public class DomainLoaderService {
|
||||
|
||||
private static boolean insertFoundDomains = Boolean.getBoolean("loader.insertFoundDomains");
|
||||
|
||||
private final HikariDataSource dataSource;
|
||||
private final Logger logger = LoggerFactory.getLogger(DomainLoaderService.class);
|
||||
private final int nodeId;
|
||||
@@ -84,25 +86,34 @@ public class DomainLoaderService {
|
||||
|
||||
// Add domains that are linked to from the domains we've just crawled, but with -1 affinity meaning they
|
||||
// can be grabbed by any index node
|
||||
try (var inserter = new DomainInserter(conn, -1);
|
||||
var processHeartbeat = heartbeat.createAdHocTaskHeartbeat("INSERT_LINKED_DOMAINS")) {
|
||||
// Add linked domains, but with -1 affinity meaning they can be grabbed by any index node
|
||||
int pageIdx = 0;
|
||||
if ( true == insertFoundDomains ) {
|
||||
logger.info("Adding found domains");
|
||||
|
||||
for (SlopTable.Ref<SlopDomainLinkRecord> page : inputData.listDomainLinkPages()) {
|
||||
processHeartbeat.progress("INSERT", pageIdx++, domainLinkPageRefs.size());
|
||||
try (var inserter = new DomainInserter(conn, -1);
|
||||
var processHeartbeat = heartbeat.createAdHocTaskHeartbeat("INSERT_LINKED_DOMAINS")) {
|
||||
// Add linked domains, but with -1 affinity meaning they can be grabbed by any index node
|
||||
int pageIdx = 0;
|
||||
|
||||
try (var reader = new SlopDomainLinkRecord.Reader(page)) {
|
||||
while (reader.hasMore()) {
|
||||
SlopDomainLinkRecord record = reader.next();
|
||||
String domainName = record.dest();
|
||||
if (domainNamesAll.add(domainName)) {
|
||||
inserter.accept(new EdgeDomain(domainName));
|
||||
for (SlopTable.Ref<SlopDomainLinkRecord> page : inputData.listDomainLinkPages()) {
|
||||
processHeartbeat.progress("INSERT", pageIdx++, domainLinkPageRefs.size());
|
||||
|
||||
try (var reader = new SlopDomainLinkRecord.Reader(page)) {
|
||||
while (reader.hasMore()) {
|
||||
SlopDomainLinkRecord record = reader.next();
|
||||
String domainName = record.dest();
|
||||
if (domainNamesAll.add(domainName)) {
|
||||
inserter.accept(new EdgeDomain(domainName));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
logger.info("Skipping found domains");
|
||||
}
|
||||
|
||||
|
||||
taskHeartbeat.progress(Steps.UPDATE_AFFINITY_AND_IP);
|
||||
|
||||
|
12
code/processes/new-domain-process/README.md
Normal file
12
code/processes/new-domain-process/README.md
Normal file
@@ -0,0 +1,12 @@
|
||||
The new domain process (NDP) is a process that evaluates new domains for
|
||||
inclusion in the search engine index.
|
||||
|
||||
It visits the root document of each candidate domain, ensures that it's reachable,
|
||||
verifies that the response is valid HTML, and checks for a few factors such as length
|
||||
and links before deciding whether to assign the domain to a node.
|
||||
|
||||
The NDP process will assign new domains to the node with the fewest assigned domains.
|
||||
|
||||
The NDP process is triggered with a goal target number of domains to process, and
|
||||
will find domains until that target is reached. If e.g. a goal of 100 is set,
|
||||
and 50 are in the index, it will find 50 more domains.
|
75
code/processes/new-domain-process/build.gradle
Normal file
75
code/processes/new-domain-process/build.gradle
Normal file
@@ -0,0 +1,75 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
|
||||
application {
|
||||
mainClass = 'nu.marginalia.ping.PingMain'
|
||||
applicationName = 'ping-process'
|
||||
}
|
||||
|
||||
tasks.distZip.enabled = false
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
|
||||
implementation project(':code:common:db')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
||||
|
||||
implementation project(':code:libraries:domain-lock')
|
||||
implementation project(':code:libraries:geo-ip')
|
||||
implementation project(':code:libraries:message-queue')
|
||||
implementation project(':code:libraries:blocking-thread-pool')
|
||||
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
|
||||
implementation project(':code:processes:process-mq-api')
|
||||
implementation project(':code:processes:crawling-process:ft-content-type')
|
||||
implementation project(':code:processes:crawling-process:ft-link-parser')
|
||||
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.notnull
|
||||
implementation libs.guava
|
||||
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
implementation libs.gson
|
||||
implementation libs.zstd
|
||||
implementation libs.bucket4j
|
||||
implementation libs.crawlercommons
|
||||
implementation libs.jsoup
|
||||
implementation libs.fastutil
|
||||
implementation libs.bundles.curator
|
||||
implementation libs.bundles.mariadb
|
||||
implementation libs.bundles.httpcomponents
|
||||
implementation libs.commons.lang3
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
|
||||
testImplementation libs.wiremock
|
||||
|
||||
|
||||
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
|
||||
testImplementation libs.commons.codec
|
||||
testImplementation 'org.testcontainers:mariadb:1.17.4'
|
||||
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
|
||||
testImplementation project(':code:libraries:test-helpers')
|
||||
|
||||
testImplementation project(':code:processes:test-data')
|
||||
}
|
||||
|
@@ -0,0 +1,146 @@
|
||||
package nu.marginalia.ndp;
|
||||
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.contenttype.ContentType;
|
||||
import nu.marginalia.contenttype.DocumentBodyToString;
|
||||
import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.ndp.io.HttpClientProvider;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.core5.http.io.entity.EntityUtils;
|
||||
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.security.KeyManagementException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.time.Duration;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/** Evaluates a domain to determine if it is worth indexing.
|
||||
* This class fetches the root document, checks the response code, content type,
|
||||
* and parses the HTML to ensure it smells alright.
|
||||
*/
|
||||
@Singleton
|
||||
public class DomainEvaluator {
|
||||
private final HttpClient client;
|
||||
private final String userAgentString = WmsaHome.getUserAgent().uaString();
|
||||
|
||||
private final LinkParser linkParser = new LinkParser();
|
||||
private final DomainCoordinator domainCoordinator;
|
||||
|
||||
@Inject
|
||||
public DomainEvaluator(DomainCoordinator domainCoordinator) throws NoSuchAlgorithmException, KeyManagementException {
|
||||
this.domainCoordinator = domainCoordinator;
|
||||
client = HttpClientProvider.createClient();
|
||||
}
|
||||
|
||||
public boolean evaluateDomain(String domainName) {
|
||||
var edgeDomain = new EdgeDomain(domainName);
|
||||
|
||||
// Grab a lock on the domain to prevent concurrent evaluations between processes
|
||||
try (var lock = domainCoordinator.lockDomain(edgeDomain)) {
|
||||
var rootUrl = edgeDomain.toRootUrlHttps();
|
||||
|
||||
var request = ClassicRequestBuilder.get(rootUrl.asURI())
|
||||
.addHeader("User-Agent", userAgentString)
|
||||
.addHeader("Accept-Encoding", "gzip")
|
||||
.addHeader("Accept", "text/html,application/xhtml+xml;q=0.9")
|
||||
.build();
|
||||
|
||||
return client.execute(request, (rsp) -> {
|
||||
if (rsp.getEntity() == null)
|
||||
return false;
|
||||
|
||||
try {
|
||||
// Check if the response code indicates a successful fetch
|
||||
if (200 != rsp.getCode()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
byte[] content;
|
||||
// Read the content from the response entity
|
||||
try (InputStream contentStream = rsp.getEntity().getContent()) {
|
||||
content = contentStream.readNBytes(8192);
|
||||
}
|
||||
|
||||
// Parse the content (if it's valid)
|
||||
ContentType contentType = ContentType.parse(rsp.getEntity().getContentType());
|
||||
|
||||
// Validate the content type
|
||||
if (!contentType.contentType().startsWith("text/html") && !contentType.contentType().startsWith("application/xhtml+xml"))
|
||||
return false;
|
||||
|
||||
// Parse the document body to a Jsoup Document
|
||||
final Document document = Jsoup.parse(DocumentBodyToString.getStringData(contentType, content));
|
||||
final String text = document.body().text();
|
||||
|
||||
if (text.length() < 100)
|
||||
return false;
|
||||
if (text.contains("404 Not Found") || text.contains("Page not found"))
|
||||
return false;
|
||||
if (hasMetaRefresh(document))
|
||||
return false; // This almost always indicates a parked domain
|
||||
if (!hasInternalLink(document, edgeDomain, rootUrl))
|
||||
return false; // No internal links means it's not worth indexing
|
||||
|
||||
return true;
|
||||
}
|
||||
catch (Exception e) {
|
||||
return false;
|
||||
}
|
||||
finally {
|
||||
// May or may not be necessary, but let's ensure we clean up the response entity
|
||||
// to avoid resource leaks
|
||||
EntityUtils.consumeQuietly(rsp.getEntity());
|
||||
|
||||
// Sleep for a while before yielding the lock, to avoid immediately hammering the domain
|
||||
// from another process
|
||||
sleepQuietly(Duration.ofSeconds(1));
|
||||
}
|
||||
});
|
||||
}
|
||||
catch (Exception ex) {
|
||||
return false; // If we fail to fetch or parse the domain, we consider it invalid
|
||||
}
|
||||
}
|
||||
|
||||
private boolean hasInternalLink(Document document, EdgeDomain currentDomain, EdgeUrl rootUrl) {
|
||||
for (Element atag : document.select("a")) {
|
||||
Optional<EdgeDomain> destDomain = linkParser
|
||||
.parseLink(rootUrl, atag)
|
||||
.map(EdgeUrl::getDomain);
|
||||
|
||||
if (destDomain.isPresent() && Objects.equals(currentDomain, destDomain.get()))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean hasMetaRefresh(Document document) {
|
||||
for (Element metaTag : document.select("meta")) {
|
||||
if ("refresh".equalsIgnoreCase(metaTag.attr("http-equiv")))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private void sleepQuietly(Duration duration) {
|
||||
try {
|
||||
TimeUnit.MILLISECONDS.sleep(duration.toMillis());
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
@@ -0,0 +1,134 @@
|
||||
package nu.marginalia.ndp;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.nodecfg.NodeConfigurationService;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.PriorityQueue;
|
||||
import java.util.Set;
|
||||
|
||||
/** DomainAllocator is responsible for assigning domains to partitions/nodes.
|
||||
* This is ensured to make sure that domains are evenly distributed across the nodes.
|
||||
*/
|
||||
public class DomainNodeAllocator {
|
||||
|
||||
private final NodeConfigurationService nodeConfigurationService;
|
||||
private final HikariDataSource dataSource;
|
||||
private final PriorityQueue<NodeCount> countPerNode = new PriorityQueue<>();
|
||||
|
||||
private volatile boolean initialized = false;
|
||||
|
||||
private record NodeCount(int nodeId, int count)
|
||||
implements Comparable<NodeCount>
|
||||
{
|
||||
public NodeCount incrementCount() {
|
||||
return new NodeCount(nodeId, count + 1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(@NotNull DomainNodeAllocator.NodeCount o) {
|
||||
return Integer.compare(this.count, o.count);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Inject
|
||||
public DomainNodeAllocator(NodeConfigurationService nodeConfigurationService, HikariDataSource dataSource) {
|
||||
this.nodeConfigurationService = nodeConfigurationService;
|
||||
this.dataSource = dataSource;
|
||||
|
||||
Thread.ofPlatform()
|
||||
.name("DomainNodeAllocator::initialize()")
|
||||
.start(this::initialize);
|
||||
}
|
||||
|
||||
public synchronized int totalCount() {
|
||||
ensureInitialized();
|
||||
return countPerNode.stream().mapToInt(NodeCount::count).sum();
|
||||
}
|
||||
|
||||
/** Returns the next node ID to assign a domain to.
|
||||
* This method is synchronized to ensure thread safety when multiple threads are allocating domains.
|
||||
* The node ID returned is guaranteed to be one of the viable nodes configured in the system.
|
||||
*/
|
||||
public synchronized int nextNodeId() {
|
||||
ensureInitialized();
|
||||
|
||||
// Synchronized is fine here as this is not a hot path
|
||||
// (and PriorityBlockingQueue won't help since we're re-adding the same element with a new count all the time)
|
||||
|
||||
NodeCount allocation = countPerNode.remove();
|
||||
countPerNode.add(allocation.incrementCount());
|
||||
return allocation.nodeId();
|
||||
}
|
||||
|
||||
|
||||
private void ensureInitialized() {
|
||||
if (initialized) return;
|
||||
|
||||
synchronized (this) {
|
||||
while (!initialized) {
|
||||
try {
|
||||
// Wait until the initialization is complete
|
||||
this.wait(1000);
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
throw new RuntimeException("DomainAllocator initialization interrupted", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void initialize() {
|
||||
if (initialized) return;
|
||||
|
||||
Set<Integer> viableNodes = new HashSet<>();
|
||||
|
||||
// Find all viable nodes that can handle batch crawls
|
||||
for (var node : nodeConfigurationService.getAll()) {
|
||||
if (node.disabled())
|
||||
continue;
|
||||
if (!node.autoAssignDomains())
|
||||
continue;
|
||||
|
||||
if (node.profile().permitBatchCrawl())
|
||||
viableNodes.add(node.node());
|
||||
}
|
||||
|
||||
// Fetch the current counts of domains per node from the database
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT COUNT(*) AS CNT, NODE_AFFINITY
|
||||
FROM EC_DOMAIN
|
||||
WHERE NODE_AFFINITY>0
|
||||
GROUP BY NODE_AFFINITY
|
||||
"""))
|
||||
{
|
||||
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
|
||||
int nodeId = rs.getInt("NODE_AFFINITY");
|
||||
int count = rs.getInt("CNT");
|
||||
|
||||
if (viableNodes.remove(nodeId)) {
|
||||
countPerNode.add(new NodeCount(nodeId, count));
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Failed to load domain counts from database", e);
|
||||
}
|
||||
|
||||
// Add any remaining viable nodes that were not found in the database
|
||||
for (int nodeId : viableNodes) {
|
||||
countPerNode.add(new NodeCount(nodeId, 0));
|
||||
}
|
||||
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
|
||||
}
|
@@ -0,0 +1,240 @@
|
||||
package nu.marginalia.ndp;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import it.unimi.dsi.fastutil.ints.Int2IntMap;
|
||||
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
|
||||
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
|
||||
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
|
||||
import nu.marginalia.ndp.model.DomainToTest;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.Connection;
|
||||
import java.sql.ResultSet;
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
public class DomainTestingQueue {
|
||||
private static Logger logger = LoggerFactory.getLogger(DomainTestingQueue.class);
|
||||
|
||||
private final ArrayBlockingQueue<DomainToTest> queue = new ArrayBlockingQueue<>(2);
|
||||
|
||||
// This will grow quite large, but should be manageable in memory, as theoretical maximum is around 100M domains,
|
||||
// order of 2 GB in memory.
|
||||
private final ConcurrentHashMap<String, Boolean> takenDomains = new ConcurrentHashMap<>();
|
||||
|
||||
private final HikariDataSource dataSource;
|
||||
private final AggregateLinkGraphClient linkGraphClient;
|
||||
|
||||
|
||||
@Inject
|
||||
public DomainTestingQueue(HikariDataSource dataSource,
|
||||
AggregateLinkGraphClient linkGraphClient
|
||||
) {
|
||||
this.dataSource = dataSource;
|
||||
this.linkGraphClient = linkGraphClient;
|
||||
|
||||
Thread.ofPlatform()
|
||||
.name("DomainTestingQueue::fetch()")
|
||||
.start(this::fetch);
|
||||
}
|
||||
|
||||
public DomainToTest next() throws InterruptedException {
|
||||
return queue.take();
|
||||
}
|
||||
|
||||
public void accept(DomainToTest domain, int nodeId) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var flagOkStmt = conn.prepareStatement("""
|
||||
UPDATE NDP_NEW_DOMAINS
|
||||
SET STATE='ACCEPTED'
|
||||
WHERE DOMAIN_ID=?
|
||||
""");
|
||||
var assignNodeStmt = conn.prepareStatement("""
|
||||
UPDATE EC_DOMAIN SET NODE_AFFINITY=?
|
||||
WHERE ID=?
|
||||
AND EC_DOMAIN.NODE_AFFINITY < 0
|
||||
""")
|
||||
)
|
||||
{
|
||||
conn.setAutoCommit(false);
|
||||
flagOkStmt.setInt(1, domain.domainId());
|
||||
flagOkStmt.executeUpdate();
|
||||
|
||||
assignNodeStmt.setInt(1, nodeId);
|
||||
assignNodeStmt.setInt(2, domain.domainId());
|
||||
assignNodeStmt.executeUpdate();
|
||||
conn.commit();
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Failed to accept domain in database", e);
|
||||
}
|
||||
}
|
||||
|
||||
public void reject(DomainToTest domain) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
UPDATE NDP_NEW_DOMAINS
|
||||
SET STATE='REJECTED', CHECK_COUNT=CHECK_COUNT + 1
|
||||
WHERE DOMAIN_ID=?
|
||||
"""))
|
||||
{
|
||||
conn.setAutoCommit(false);
|
||||
stmt.setInt(1, domain.domainId());
|
||||
stmt.executeUpdate();
|
||||
conn.commit();
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Failed to reject domain in database", e);
|
||||
}
|
||||
}
|
||||
|
||||
public void fetch() {
|
||||
while (true) {
|
||||
List<DomainToTest> domains = new ArrayList<>(2000);
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT DOMAIN_ID, DOMAIN_NAME
|
||||
FROM NDP_NEW_DOMAINS
|
||||
INNER JOIN EC_DOMAIN ON ID=DOMAIN_ID
|
||||
WHERE NDP_NEW_DOMAINS.STATE = 'NEW'
|
||||
ORDER BY PRIORITY DESC
|
||||
LIMIT 2000
|
||||
"""))
|
||||
{
|
||||
var rs = stmt.executeQuery();
|
||||
|
||||
while (rs.next()) {
|
||||
int domainId = rs.getInt("DOMAIN_ID");
|
||||
String domainName = rs.getString("DOMAIN_NAME");
|
||||
if (takenDomains.put(domainName, true) != null) {
|
||||
logger.warn("Domain {} is already processed, skipping", domainName);
|
||||
continue; // Skip if already taken
|
||||
}
|
||||
domains.add(new DomainToTest(domainName, domainId));
|
||||
}
|
||||
|
||||
if (domains.isEmpty()) {
|
||||
if (!refreshQueue(conn)) {
|
||||
throw new RuntimeException("No new domains found, aborting!");
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (RuntimeException e) {
|
||||
throw e; // Rethrow runtime exceptions to avoid wrapping them in another runtime exception
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException("Failed to fetch domains from database", e);
|
||||
}
|
||||
|
||||
try {
|
||||
for (var domain : domains) {
|
||||
queue.put(domain);
|
||||
}
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
throw new RuntimeException("Domain fetching interrupted", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean refreshQueue(Connection conn) {
|
||||
logger.info("Refreshing domain queue in database");
|
||||
|
||||
Int2IntMap domainIdToCount = new Int2IntOpenHashMap();
|
||||
|
||||
// Load known domain IDs from the database to avoid inserting duplicates from NDP_NEW_DOMAINS
|
||||
// or domains that are already assigned to a node
|
||||
{
|
||||
IntOpenHashSet knownIds = new IntOpenHashSet();
|
||||
|
||||
try (var stmt = conn.createStatement()) {
|
||||
ResultSet rs = stmt.executeQuery("SELECT DOMAIN_ID FROM NDP_NEW_DOMAINS");
|
||||
rs.setFetchSize(10_000);
|
||||
while (rs.next()) {
|
||||
int domainId = rs.getInt("DOMAIN_ID");
|
||||
knownIds.add(domainId);
|
||||
}
|
||||
|
||||
rs = stmt.executeQuery("SELECT ID FROM EC_DOMAIN WHERE NODE_AFFINITY>=0");
|
||||
rs.setFetchSize(10_000);
|
||||
while (rs.next()) {
|
||||
int domainId = rs.getInt("ID");
|
||||
knownIds.add(domainId);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Failed to load known domain IDs from database", e);
|
||||
}
|
||||
|
||||
// Ensure the link graph is ready before proceeding. This is mainly necessary in a cold reboot
|
||||
// of the entire system.
|
||||
try {
|
||||
logger.info("Waiting for link graph client to be ready...");
|
||||
linkGraphClient.waitReady(Duration.ofHours(1));
|
||||
logger.info("Link graph client is ready, fetching domain links...");
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
// Fetch all domain links from the link graph and count by how many sources each dest domain is linked from
|
||||
var iter = linkGraphClient.getAllDomainLinks().iterator();
|
||||
while (iter.advance()) {
|
||||
int dest = iter.dest();
|
||||
if (!knownIds.contains(dest)) {
|
||||
domainIdToCount.mergeInt(dest, 1, (i, j) -> i + j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
boolean didInsert = false;
|
||||
|
||||
/* Insert new domains into NDP_NEW_DOMAINS table */
|
||||
try (var insertStmt = conn.prepareStatement("""
|
||||
INSERT IGNORE INTO NDP_NEW_DOMAINS (DOMAIN_ID, PRIORITY) VALUES (?, ?)
|
||||
""")) {
|
||||
conn.setAutoCommit(false);
|
||||
|
||||
int cnt = 0;
|
||||
for (var entry : domainIdToCount.int2IntEntrySet()) {
|
||||
int domainId = entry.getIntKey();
|
||||
int count = entry.getIntValue();
|
||||
|
||||
insertStmt.setInt(1, domainId);
|
||||
insertStmt.setInt(2, count);
|
||||
insertStmt.addBatch();
|
||||
|
||||
if (++cnt >= 1000) {
|
||||
cnt = 0;
|
||||
insertStmt.executeBatch(); // Execute in batches to avoid memory issues
|
||||
conn.commit();
|
||||
didInsert = true;
|
||||
}
|
||||
}
|
||||
if (cnt != 0) {
|
||||
insertStmt.executeBatch(); // Execute any remaining batch
|
||||
conn.commit();
|
||||
didInsert = true;
|
||||
}
|
||||
|
||||
logger.info("Queue refreshed successfully");
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Failed to refresh queue in database", e);
|
||||
}
|
||||
|
||||
// Clean up NDP_NEW_DOMAINS table to remove any domains that are already in EC_DOMAIN
|
||||
// This acts not only to clean up domains that we've flagged as ACCEPTED, but also to
|
||||
// repair inconsistent states where domains might have incorrectly been added to NDP_NEW_DOMAINS
|
||||
try (var stmt = conn.createStatement()) {
|
||||
stmt.executeUpdate("DELETE FROM NDP_NEW_DOMAINS WHERE DOMAIN_ID IN (SELECT ID FROM EC_DOMAIN WHERE NODE_AFFINITY>=0)");
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException("Failed to clean up NDP_NEW_DOMAINS", e);
|
||||
}
|
||||
|
||||
return didInsert;
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,159 @@
|
||||
package nu.marginalia.ndp;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Injector;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.coordination.DomainCoordinationModule;
|
||||
import nu.marginalia.db.DomainBlacklist;
|
||||
import nu.marginalia.geoip.GeoIpDictionary;
|
||||
import nu.marginalia.mq.MessageQueueFactory;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.mqapi.ndp.NdpRequest;
|
||||
import nu.marginalia.ndp.model.DomainToTest;
|
||||
import nu.marginalia.process.ProcessConfiguration;
|
||||
import nu.marginalia.process.ProcessConfigurationModule;
|
||||
import nu.marginalia.process.ProcessMainClass;
|
||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||
import nu.marginalia.service.module.DatabaseModule;
|
||||
import nu.marginalia.service.module.ServiceDiscoveryModule;
|
||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.security.Security;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
public class NdpMain extends ProcessMainClass {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(NdpMain.class);
|
||||
private final DomainNodeAllocator domainNodeAllocator;
|
||||
private final DomainTestingQueue domainTestingQueue;
|
||||
private final ProcessHeartbeat processHeartbeat;
|
||||
private final DomainEvaluator domainEvaluator;
|
||||
private final DomainBlacklist domainBlacklist;
|
||||
|
||||
private final AtomicInteger domainCount = new AtomicInteger(0);
|
||||
|
||||
@Inject
|
||||
public NdpMain(MessageQueueFactory messageQueueFactory,
|
||||
ProcessConfiguration config,
|
||||
DomainNodeAllocator domainNodeAllocator,
|
||||
DomainTestingQueue domainTestingQueue,
|
||||
DomainEvaluator domainEvaluator,
|
||||
DomainBlacklist domainBlacklist,
|
||||
ProcessHeartbeat processHeartbeat,
|
||||
Gson gson)
|
||||
{
|
||||
super(messageQueueFactory, config, gson, ProcessInboxNames.NDP_INBOX);
|
||||
|
||||
this.domainNodeAllocator = domainNodeAllocator;
|
||||
this.domainEvaluator = domainEvaluator;
|
||||
this.domainBlacklist = domainBlacklist;
|
||||
this.domainTestingQueue = domainTestingQueue;
|
||||
this.processHeartbeat = processHeartbeat;
|
||||
}
|
||||
|
||||
|
||||
public void run(int goalCount) throws InterruptedException {
|
||||
logger.info("Wait for blacklist to load...");
|
||||
domainBlacklist.waitUntilLoaded();
|
||||
|
||||
SimpleBlockingThreadPool threadPool = new SimpleBlockingThreadPool(
|
||||
"NDP-Worker",
|
||||
8,
|
||||
10,
|
||||
SimpleBlockingThreadPool.ThreadType.PLATFORM
|
||||
);
|
||||
|
||||
logger.info("Starting NDP process");
|
||||
|
||||
int toInsertCount = goalCount - domainNodeAllocator.totalCount();
|
||||
|
||||
if (toInsertCount <= 0) {
|
||||
logger.info("No new domains to process. Current count: " + domainNodeAllocator.totalCount());
|
||||
return;
|
||||
}
|
||||
|
||||
try (var hb = processHeartbeat.createAdHocTaskHeartbeat("Growing Index")) {
|
||||
int cnt;
|
||||
while ((cnt = domainCount.get()) < toInsertCount) {
|
||||
if (cnt % 100 == 0) {
|
||||
hb.progress("Discovery Process", cnt, toInsertCount);
|
||||
}
|
||||
|
||||
final DomainToTest nextDomain = domainTestingQueue.next();
|
||||
threadPool.submit(() -> {
|
||||
try {
|
||||
if (domainEvaluator.evaluateDomain(nextDomain.domainName())) {
|
||||
logger.info("Accepting: {}", nextDomain.domainName());
|
||||
domainCount.incrementAndGet();
|
||||
domainTestingQueue.accept(nextDomain, domainNodeAllocator.nextNodeId());
|
||||
} else {
|
||||
logger.info("Rejecting: {}", nextDomain.domainName());
|
||||
domainTestingQueue.reject(nextDomain);
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
domainTestingQueue.reject(nextDomain);
|
||||
logger.error("Error evaluating domain: " + nextDomain.domainId(), e);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
threadPool.shutDown();
|
||||
// Wait for all tasks to complete or give up after 1 hour
|
||||
threadPool.awaitTermination(1, TimeUnit.HOURS);
|
||||
|
||||
logger.info("NDP process completed. Total domains processed: " + domainCount.get());
|
||||
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
// Prevent Java from caching DNS lookups forever (filling up the system RAM as a result)
|
||||
Security.setProperty("networkaddress.cache.ttl" , "3600");
|
||||
|
||||
// This must run *early*
|
||||
System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
|
||||
|
||||
// If these aren't set properly, the JVM will hang forever on some requests
|
||||
System.setProperty("sun.net.client.defaultConnectTimeout", "30000");
|
||||
System.setProperty("sun.net.client.defaultReadTimeout", "30000");
|
||||
|
||||
// Set the maximum number of connections to keep alive in the connection pool
|
||||
System.setProperty("jdk.httpclient.idleTimeout", "15"); // 15 seconds
|
||||
System.setProperty("jdk.httpclient.connectionPoolSize", "256");
|
||||
|
||||
// We don't want to use too much memory caching sessions for https
|
||||
System.setProperty("javax.net.ssl.sessionCacheSize", "2048");
|
||||
|
||||
|
||||
Injector injector = Guice.createInjector(
|
||||
new NdpModule(),
|
||||
new ServiceDiscoveryModule(),
|
||||
new DomainCoordinationModule(),
|
||||
new ProcessConfigurationModule("ndp"),
|
||||
new DatabaseModule(false)
|
||||
);
|
||||
|
||||
GeoIpDictionary geoIpDictionary = injector.getInstance(GeoIpDictionary.class);
|
||||
|
||||
geoIpDictionary.waitReady(); // Ensure the GeoIpDictionary is ready before proceeding
|
||||
|
||||
NdpMain main = injector.getInstance(NdpMain.class);
|
||||
|
||||
var instructions = main.fetchInstructions(NdpRequest.class);
|
||||
|
||||
try {
|
||||
main.run(instructions.value().goal());
|
||||
instructions.ok();
|
||||
}
|
||||
catch (Throwable ex) {
|
||||
logger.error("Error running ping process", ex);
|
||||
instructions.err();
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,8 @@
|
||||
package nu.marginalia.ndp;
|
||||
|
||||
import com.google.inject.AbstractModule;
|
||||
|
||||
public class NdpModule extends AbstractModule {
|
||||
public void configure() {
|
||||
}
|
||||
}
|
@@ -0,0 +1,126 @@
|
||||
package nu.marginalia.ndp.io;
|
||||
|
||||
import com.google.inject.Provider;
|
||||
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.config.ConnectionConfig;
|
||||
import org.apache.hc.client5.http.config.RequestConfig;
|
||||
import org.apache.hc.client5.http.cookie.StandardCookieSpec;
|
||||
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
|
||||
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager;
|
||||
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder;
|
||||
import org.apache.hc.core5.http.HeaderElement;
|
||||
import org.apache.hc.core5.http.HeaderElements;
|
||||
import org.apache.hc.core5.http.HttpResponse;
|
||||
import org.apache.hc.core5.http.io.SocketConfig;
|
||||
import org.apache.hc.core5.http.message.MessageSupport;
|
||||
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||
import org.apache.hc.core5.util.TimeValue;
|
||||
import org.apache.hc.core5.util.Timeout;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.security.KeyManagementException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.util.Iterator;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class HttpClientProvider implements Provider<HttpClient> {
|
||||
private static final HttpClient client;
|
||||
private static PoolingHttpClientConnectionManager connectionManager;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(HttpClientProvider.class);
|
||||
|
||||
static {
|
||||
try {
|
||||
client = createClient();
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public static CloseableHttpClient createClient() throws NoSuchAlgorithmException, KeyManagementException {
|
||||
final ConnectionConfig connectionConfig = ConnectionConfig.custom()
|
||||
.setSocketTimeout(15, TimeUnit.SECONDS)
|
||||
.setConnectTimeout(15, TimeUnit.SECONDS)
|
||||
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
|
||||
.build();
|
||||
|
||||
|
||||
connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
|
||||
.setMaxConnPerRoute(2)
|
||||
.setMaxConnTotal(50)
|
||||
.setDefaultConnectionConfig(connectionConfig)
|
||||
.build();
|
||||
|
||||
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
|
||||
.setSoLinger(TimeValue.ofSeconds(-1))
|
||||
.setSoTimeout(Timeout.ofSeconds(10))
|
||||
.build()
|
||||
);
|
||||
|
||||
Thread.ofPlatform().daemon(true).start(() -> {
|
||||
try {
|
||||
for (;;) {
|
||||
TimeUnit.SECONDS.sleep(15);
|
||||
logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
|
||||
}
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
});
|
||||
|
||||
final RequestConfig defaultRequestConfig = RequestConfig.custom()
|
||||
.setCookieSpec(StandardCookieSpec.IGNORE)
|
||||
.setResponseTimeout(10, TimeUnit.SECONDS)
|
||||
.setConnectionRequestTimeout(5, TimeUnit.MINUTES)
|
||||
.build();
|
||||
|
||||
return HttpClients.custom()
|
||||
.setConnectionManager(connectionManager)
|
||||
.setRetryStrategy(new RetryStrategy())
|
||||
.setKeepAliveStrategy(new ConnectionKeepAliveStrategy() {
|
||||
// Default keep-alive duration is 3 minutes, but this is too long for us,
|
||||
// as we are either going to re-use it fairly quickly or close it for a long time.
|
||||
//
|
||||
// So we set it to 30 seconds or clamp the server-provided value to a minimum of 10 seconds.
|
||||
private static final TimeValue defaultValue = TimeValue.ofSeconds(30);
|
||||
|
||||
@Override
|
||||
public TimeValue getKeepAliveDuration(HttpResponse response, HttpContext context) {
|
||||
final Iterator<HeaderElement> it = MessageSupport.iterate(response, HeaderElements.KEEP_ALIVE);
|
||||
|
||||
while (it.hasNext()) {
|
||||
final HeaderElement he = it.next();
|
||||
final String param = he.getName();
|
||||
final String value = he.getValue();
|
||||
|
||||
if (value == null)
|
||||
continue;
|
||||
if (!"timeout".equalsIgnoreCase(param))
|
||||
continue;
|
||||
|
||||
try {
|
||||
long timeout = Long.parseLong(value);
|
||||
timeout = Math.clamp(timeout, 30, defaultValue.toSeconds());
|
||||
return TimeValue.ofSeconds(timeout);
|
||||
} catch (final NumberFormatException ignore) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return defaultValue;
|
||||
}
|
||||
})
|
||||
.disableRedirectHandling()
|
||||
.setDefaultRequestConfig(defaultRequestConfig)
|
||||
.build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HttpClient get() {
|
||||
return client;
|
||||
}
|
||||
}
|
||||
|
@@ -0,0 +1,79 @@
|
||||
package nu.marginalia.ndp.io;
|
||||
|
||||
import org.apache.hc.client5.http.HttpHostConnectException;
|
||||
import org.apache.hc.client5.http.HttpRequestRetryStrategy;
|
||||
import org.apache.hc.core5.http.HttpRequest;
|
||||
import org.apache.hc.core5.http.HttpResponse;
|
||||
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||
import org.apache.hc.core5.util.TimeValue;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.net.ssl.SSLException;
|
||||
import java.io.IOException;
|
||||
import java.net.SocketException;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.net.UnknownHostException;
|
||||
|
||||
public class RetryStrategy implements HttpRequestRetryStrategy {
|
||||
private static final Logger logger = LoggerFactory.getLogger(RetryStrategy.class);
|
||||
|
||||
@Override
|
||||
public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
|
||||
return switch (exception) {
|
||||
case SocketTimeoutException ste -> false;
|
||||
case SSLException ssle -> false;
|
||||
case UnknownHostException uhe -> false;
|
||||
case HttpHostConnectException ex -> executionCount < 2;
|
||||
case SocketException ex -> executionCount < 2;
|
||||
default -> executionCount <= 3;
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean retryRequest(HttpResponse response, int executionCount, HttpContext context) {
|
||||
return switch (response.getCode()) {
|
||||
case 500, 503 -> executionCount <= 2;
|
||||
case 429 -> executionCount <= 3;
|
||||
default -> false;
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public TimeValue getRetryInterval(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
|
||||
return TimeValue.ofSeconds(1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TimeValue getRetryInterval(HttpResponse response, int executionCount, HttpContext context) {
|
||||
|
||||
int statusCode = response.getCode();
|
||||
|
||||
// Give 503 a bit more time
|
||||
if (statusCode == 503) return TimeValue.ofSeconds(5);
|
||||
|
||||
if (statusCode == 429) {
|
||||
// get the Retry-After header
|
||||
var retryAfterHeader = response.getFirstHeader("Retry-After");
|
||||
if (retryAfterHeader == null) {
|
||||
return TimeValue.ofSeconds(3);
|
||||
}
|
||||
|
||||
String retryAfter = retryAfterHeader.getValue();
|
||||
if (retryAfter == null) {
|
||||
return TimeValue.ofSeconds(2);
|
||||
}
|
||||
|
||||
try {
|
||||
int retryAfterTime = Integer.parseInt(retryAfter);
|
||||
retryAfterTime = Math.clamp(retryAfterTime, 1, 5);
|
||||
|
||||
return TimeValue.ofSeconds(retryAfterTime);
|
||||
} catch (NumberFormatException e) {
|
||||
logger.warn("Invalid Retry-After header: {}", retryAfter);
|
||||
}
|
||||
}
|
||||
|
||||
return TimeValue.ofSeconds(2);
|
||||
}
|
||||
}
|
@@ -0,0 +1,4 @@
|
||||
package nu.marginalia.ndp.model;
|
||||
|
||||
public record DomainToTest(String domainName, int domainId) {
|
||||
}
|
@@ -0,0 +1,29 @@
|
||||
package nu.marginalia.ndp;
|
||||
|
||||
import nu.marginalia.coordination.LocalDomainCoordinator;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.security.KeyManagementException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
class DomainEvaluatorTest {
|
||||
|
||||
@Tag("flaky") // Exclude from CI runs due to potential network issues
|
||||
@Test
|
||||
public void testSunnyDay() throws NoSuchAlgorithmException, KeyManagementException {
|
||||
DomainEvaluator evaluator = new DomainEvaluator(new LocalDomainCoordinator());
|
||||
|
||||
// Should be a valid domain
|
||||
assertTrue(evaluator.evaluateDomain("www.marginalia.nu"));
|
||||
|
||||
// Should be a redirect to www.marginalia.nu
|
||||
assertFalse(evaluator.evaluateDomain("memex.marginalia.nu"));
|
||||
|
||||
// Should fail on Anubis
|
||||
assertFalse(evaluator.evaluateDomain("marginalia-search.com"));
|
||||
}
|
||||
}
|
12
code/processes/ping-process/README.md
Normal file
12
code/processes/ping-process/README.md
Normal file
@@ -0,0 +1,12 @@
|
||||
The ping process (which has nothing to do with ICMP ping) keeps track of
|
||||
the aliveness of websites. It also gathers fingerprint information about
|
||||
the security posture of the website, as well as DNS information.
|
||||
|
||||
This is kept to build an idea of when a website is down, and to identify
|
||||
ownership changes, as well as other significant events in the lifecycle
|
||||
of a website.
|
||||
|
||||
# Central Classes
|
||||
|
||||
* [PingMain](java/nu/marginalia/ping/PingMain.java) main class.
|
||||
* [PingJobScheduler](java/nu/marginalia/ping/PingJobScheduler.java) service that dispatches pings.
|
@@ -61,14 +61,14 @@ public class BackoffStrategy {
|
||||
};
|
||||
|
||||
double backoffMinutes = baseInterval.toMinutes()
|
||||
* Math.pow(multiplier, backoffConsecutiveFailures - 1);
|
||||
* Math.pow(multiplier, Math.clamp(backoffConsecutiveFailures, 1, 10));
|
||||
|
||||
Duration newDuration = Duration.ofMinutes(Math.round(0.5+backoffMinutes));
|
||||
if (newDuration.compareTo(maxInterval) > 0) {
|
||||
var backoffVal = Math.round(0.5+backoffMinutes);
|
||||
if (backoffVal > maxInterval.toMinutes()) {
|
||||
return maxInterval;
|
||||
}
|
||||
|
||||
return newDuration;
|
||||
return Duration.ofMinutes(backoffVal);
|
||||
}
|
||||
|
||||
private Duration addJitter(Duration duration) {
|
||||
|
@@ -185,12 +185,12 @@ public class PingDao {
|
||||
return null;
|
||||
}
|
||||
|
||||
public List<UpdateSchedule.UpdateJob<Long, HistoricalAvailabilityData>> getDomainUpdateSchedule(int nodeId) {
|
||||
List<UpdateSchedule.UpdateJob<Long, HistoricalAvailabilityData>> updateJobs = new ArrayList<>();
|
||||
public List<UpdateSchedule.UpdateJob<DomainReference, HistoricalAvailabilityData>> getDomainUpdateSchedule(int nodeId) {
|
||||
List<UpdateSchedule.UpdateJob<DomainReference, HistoricalAvailabilityData>> updateJobs = new ArrayList<>();
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var ps = conn.prepareStatement("""
|
||||
SELECT ID, NEXT_SCHEDULED_UPDATE
|
||||
SELECT ID, DOMAIN_NAME, NEXT_SCHEDULED_UPDATE
|
||||
FROM EC_DOMAIN
|
||||
LEFT JOIN DOMAIN_AVAILABILITY_INFORMATION
|
||||
ON EC_DOMAIN.ID = DOMAIN_AVAILABILITY_INFORMATION.DOMAIN_ID
|
||||
@@ -200,11 +200,13 @@ public class PingDao {
|
||||
ps.setInt(1, nodeId);
|
||||
ResultSet rs = ps.executeQuery();
|
||||
while (rs.next()) {
|
||||
long domainId = rs.getLong("ID");
|
||||
int domainId = rs.getInt("ID");
|
||||
String domainName = rs.getString("DOMAIN_NAME");
|
||||
var ts = rs.getTimestamp("NEXT_SCHEDULED_UPDATE");
|
||||
Instant nextUpdate = ts == null ? Instant.now() : ts.toInstant();
|
||||
|
||||
updateJobs.add(new UpdateSchedule.UpdateJob<>(domainId, nextUpdate));
|
||||
var ref = new DomainReference(domainId, nodeId, domainName.toLowerCase());
|
||||
updateJobs.add(new UpdateSchedule.UpdateJob<>(ref, nextUpdate));
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
throw new RuntimeException("Failed to retrieve domain update schedule", e);
|
||||
@@ -241,7 +243,7 @@ public class PingDao {
|
||||
else {
|
||||
var record = new DomainDnsRecord(rs);
|
||||
updateJobs.add(new UpdateSchedule.UpdateJob<>(
|
||||
new RootDomainReference.ById(dnsRootDomainId),
|
||||
new RootDomainReference.ByIdAndName(dnsRootDomainId, rootDomainName),
|
||||
Objects.requireNonNullElseGet(record.tsNextScheduledUpdate(), Instant::now))
|
||||
);
|
||||
}
|
||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.ping;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.ping.model.*;
|
||||
import nu.marginalia.ping.svc.DnsPingService;
|
||||
import nu.marginalia.ping.svc.HttpPingService;
|
||||
@@ -31,7 +32,7 @@ public class PingJobScheduler {
|
||||
|
||||
private static final UpdateSchedule<RootDomainReference, RootDomainReference> dnsUpdateSchedule
|
||||
= new UpdateSchedule<>(250_000);
|
||||
private static final UpdateSchedule<Long, HistoricalAvailabilityData> availabilityUpdateSchedule
|
||||
private static final UpdateSchedule<DomainReference, HistoricalAvailabilityData> availabilityUpdateSchedule
|
||||
= new UpdateSchedule<>(250_000);
|
||||
|
||||
public volatile Instant dnsLastSync = Instant.now();
|
||||
@@ -138,7 +139,15 @@ public class PingJobScheduler {
|
||||
continue;
|
||||
}
|
||||
|
||||
long nextId = availabilityUpdateSchedule.next();
|
||||
DomainReference ref = availabilityUpdateSchedule.nextIf(domain -> {
|
||||
EdgeDomain domainObj = new EdgeDomain(domain.domainName());
|
||||
if (!domainCoordinator.isLockableHint(domainObj)) {
|
||||
return false; // Skip locked domains
|
||||
}
|
||||
return true; // Process this domain
|
||||
});
|
||||
|
||||
long nextId = ref.domainId();
|
||||
var data = pingDao.getHistoricalAvailabilityData(nextId);
|
||||
if (data == null) {
|
||||
logger.warn("No availability data found for ID: {}", nextId);
|
||||
@@ -163,7 +172,7 @@ public class PingJobScheduler {
|
||||
for (var object : objects) {
|
||||
var ts = object.nextUpdateTime();
|
||||
if (ts != null) {
|
||||
availabilityUpdateSchedule.add(nextId, ts);
|
||||
availabilityUpdateSchedule.add(ref, ts);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -194,7 +203,7 @@ public class PingJobScheduler {
|
||||
|
||||
try {
|
||||
List<WritableModel> objects = switch(ref) {
|
||||
case RootDomainReference.ById(long id) -> {
|
||||
case RootDomainReference.ByIdAndName(long id, String name) -> {
|
||||
var oldRecord = Objects.requireNonNull(pingDao.getDomainDnsRecord(id));
|
||||
yield dnsPingService.pingDomain(oldRecord.rootDomainName(), oldRecord);
|
||||
}
|
||||
|
@@ -2,9 +2,8 @@ package nu.marginalia.ping;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.PriorityQueue;
|
||||
import java.util.*;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
/** In-memory schedule for updates, allowing jobs to be added and processed in order of their scheduled time.
|
||||
* This is not a particularly high-performance implementation, but exists to take contention off the database's
|
||||
@@ -23,6 +22,9 @@ public class UpdateSchedule<T, T2> {
|
||||
notifyAll();
|
||||
}
|
||||
|
||||
/** Returns the next job in the queue that is due to be processed.
|
||||
* If no jobs are due, it will block until a job is added or a job becomes due.
|
||||
* */
|
||||
public synchronized T next() throws InterruptedException {
|
||||
while (true) {
|
||||
if (updateQueue.isEmpty()) {
|
||||
@@ -44,6 +46,56 @@ public class UpdateSchedule<T, T2> {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** Returns the first job in the queue matching the predicate that is not scheduled into the future,
|
||||
* blocking until a job is added or a job becomes due.
|
||||
*/
|
||||
public synchronized T nextIf(Predicate<T> predicate) throws InterruptedException {
|
||||
List<UpdateJob<T, T2>> rejectedJobs = new ArrayList<>();
|
||||
|
||||
try {
|
||||
while (true) {
|
||||
if (updateQueue.isEmpty()) {
|
||||
wait(); // Wait for a new job to be added
|
||||
continue;
|
||||
}
|
||||
|
||||
UpdateJob<T, T2> job = updateQueue.peek();
|
||||
Instant now = Instant.now();
|
||||
|
||||
if (job.updateTime.isAfter(now)) {
|
||||
Duration toWait = Duration.between(now, job.updateTime);
|
||||
|
||||
// Return the rejected jobs to the queue for other threads to process
|
||||
updateQueue.addAll(rejectedJobs);
|
||||
if (!rejectedJobs.isEmpty())
|
||||
notifyAll();
|
||||
rejectedJobs.clear();
|
||||
|
||||
wait(Math.max(1, toWait.toMillis()));
|
||||
} else {
|
||||
var candidate = updateQueue.poll(); // Remove the job from the queue since it's due
|
||||
|
||||
assert candidate != null : "Update job should not be null at this point, since we just peeked it in a synchronized block";
|
||||
|
||||
if (!predicate.test(candidate.key())) {
|
||||
rejectedJobs.add(candidate);
|
||||
}
|
||||
else {
|
||||
return candidate.key();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
finally {
|
||||
// Return the rejected jobs to the queue for other threads to process
|
||||
updateQueue.addAll(rejectedJobs);
|
||||
if (!rejectedJobs.isEmpty())
|
||||
notifyAll();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public synchronized void clear() {
|
||||
updateQueue.clear();
|
||||
notifyAll();
|
||||
|
@@ -4,12 +4,14 @@ import com.google.inject.Inject;
|
||||
import nu.marginalia.UserAgent;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.ping.fetcher.response.*;
|
||||
import org.apache.hc.client5.http.HttpHostConnectException;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.protocol.HttpClientContext;
|
||||
import org.apache.hc.core5.http.Header;
|
||||
import org.apache.hc.core5.http.io.entity.EntityUtils;
|
||||
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
||||
|
||||
import javax.net.ssl.SSLHandshakeException;
|
||||
import java.io.IOException;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.time.Duration;
|
||||
@@ -82,9 +84,12 @@ public class PingHttpFetcher {
|
||||
});
|
||||
} catch (SocketTimeoutException ex) {
|
||||
return new TimeoutResponse(ex.getMessage());
|
||||
} catch (IOException e) {
|
||||
} catch (HttpHostConnectException | SSLHandshakeException e) {
|
||||
return new ConnectionError(e.getClass().getSimpleName());
|
||||
} catch (IOException e) {
|
||||
return new ProtocolError(e.getClass().getSimpleName());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -18,13 +18,18 @@ import org.apache.hc.core5.http.HttpResponse;
|
||||
import org.apache.hc.core5.http.io.SocketConfig;
|
||||
import org.apache.hc.core5.http.message.MessageSupport;
|
||||
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||
import org.apache.hc.core5.ssl.SSLContextBuilder;
|
||||
import org.apache.hc.core5.util.TimeValue;
|
||||
import org.apache.hc.core5.util.Timeout;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.net.ssl.SSLContext;
|
||||
import javax.net.ssl.TrustManager;
|
||||
import javax.net.ssl.X509TrustManager;
|
||||
import java.security.KeyManagementException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.security.cert.X509Certificate;
|
||||
import java.util.Iterator;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
@@ -37,24 +42,55 @@ public class HttpClientProvider implements Provider<HttpClient> {
|
||||
static {
|
||||
try {
|
||||
client = createClient();
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private static CloseableHttpClient createClient() throws NoSuchAlgorithmException {
|
||||
private static CloseableHttpClient createClient() throws NoSuchAlgorithmException, KeyManagementException {
|
||||
final ConnectionConfig connectionConfig = ConnectionConfig.custom()
|
||||
.setSocketTimeout(15, TimeUnit.SECONDS)
|
||||
.setConnectTimeout(15, TimeUnit.SECONDS)
|
||||
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
|
||||
.build();
|
||||
|
||||
// No-op up front validation of server certificates.
|
||||
//
|
||||
// We will validate certificates later, after the connection is established
|
||||
// as we want to store the certificate chain and validation
|
||||
// outcome to the database.
|
||||
|
||||
var trustMeBro = new X509TrustManager() {
|
||||
private X509Certificate[] lastServerCertChain;
|
||||
|
||||
@Override
|
||||
public void checkClientTrusted(X509Certificate[] chain, String authType) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void checkServerTrusted(X509Certificate[] chain, String authType) {
|
||||
this.lastServerCertChain = chain.clone();
|
||||
}
|
||||
|
||||
@Override
|
||||
public X509Certificate[] getAcceptedIssuers() {
|
||||
return new X509Certificate[0];
|
||||
}
|
||||
|
||||
public X509Certificate[] getLastServerCertChain() {
|
||||
return lastServerCertChain != null ? lastServerCertChain.clone() : null;
|
||||
}
|
||||
};
|
||||
|
||||
SSLContext sslContext = SSLContextBuilder.create().build();
|
||||
sslContext.init(null, new TrustManager[]{trustMeBro}, null);
|
||||
|
||||
connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
|
||||
.setMaxConnPerRoute(2)
|
||||
.setMaxConnTotal(50)
|
||||
.setDefaultConnectionConfig(connectionConfig)
|
||||
.setTlsSocketStrategy(
|
||||
new DefaultClientTlsStrategy(SSLContext.getDefault(), NoopHostnameVerifier.INSTANCE))
|
||||
new DefaultClientTlsStrategy(sslContext, NoopHostnameVerifier.INSTANCE))
|
||||
.build();
|
||||
|
||||
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
|
||||
@@ -76,7 +112,7 @@ public class HttpClientProvider implements Provider<HttpClient> {
|
||||
});
|
||||
|
||||
final RequestConfig defaultRequestConfig = RequestConfig.custom()
|
||||
.setCookieSpec(StandardCookieSpec.RELAXED)
|
||||
.setCookieSpec(StandardCookieSpec.IGNORE)
|
||||
.setResponseTimeout(10, TimeUnit.SECONDS)
|
||||
.setConnectionRequestTimeout(5, TimeUnit.MINUTES)
|
||||
.build();
|
||||
|
@@ -1,5 +1,6 @@
|
||||
package nu.marginalia.ping.io;
|
||||
|
||||
import org.apache.hc.client5.http.HttpHostConnectException;
|
||||
import org.apache.hc.client5.http.HttpRequestRetryStrategy;
|
||||
import org.apache.hc.core5.http.HttpRequest;
|
||||
import org.apache.hc.core5.http.HttpResponse;
|
||||
@@ -10,6 +11,7 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.net.ssl.SSLException;
|
||||
import java.io.IOException;
|
||||
import java.net.SocketException;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.net.UnknownHostException;
|
||||
|
||||
@@ -22,6 +24,8 @@ public class RetryStrategy implements HttpRequestRetryStrategy {
|
||||
case SocketTimeoutException ste -> false;
|
||||
case SSLException ssle -> false;
|
||||
case UnknownHostException uhe -> false;
|
||||
case HttpHostConnectException ex -> executionCount < 2;
|
||||
case SocketException ex -> executionCount < 2;
|
||||
default -> executionCount <= 3;
|
||||
};
|
||||
}
|
||||
|
@@ -1,5 +1,7 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.sql.Connection;
|
||||
import java.sql.ResultSet;
|
||||
@@ -279,7 +281,7 @@ implements WritableModel
|
||||
}
|
||||
|
||||
public Builder httpLocation(String httpLocation) {
|
||||
this.httpLocation = httpLocation;
|
||||
this.httpLocation = StringUtils.abbreviate(httpLocation, "...",255);
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@@ -18,6 +18,7 @@ public record DomainSecurityEvent(
|
||||
boolean certificatePublicKeyChanged,
|
||||
boolean certificateSerialNumberChanged,
|
||||
boolean certificateIssuerChanged,
|
||||
SchemaChange schemaChange,
|
||||
Duration oldCertificateTimeToExpiry,
|
||||
boolean securityHeadersChanged,
|
||||
boolean ipChanged,
|
||||
@@ -45,8 +46,9 @@ public record DomainSecurityEvent(
|
||||
security_signature_before,
|
||||
security_signature_after,
|
||||
change_certificate_serial_number,
|
||||
change_certificate_issuer
|
||||
) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
|
||||
change_certificate_issuer,
|
||||
change_schema
|
||||
) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
|
||||
"""))
|
||||
{
|
||||
|
||||
@@ -81,6 +83,7 @@ public record DomainSecurityEvent(
|
||||
|
||||
ps.setBoolean(15, certificateSerialNumberChanged());
|
||||
ps.setBoolean(16, certificateIssuerChanged());
|
||||
ps.setString(17, schemaChange.name());
|
||||
|
||||
ps.executeUpdate();
|
||||
}
|
||||
|
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.sql.Connection;
|
||||
@@ -42,7 +43,10 @@ public record DomainSecurityRecord(
|
||||
@Nullable String headerXXssProtection,
|
||||
@Nullable String headerServer,
|
||||
@Nullable String headerXPoweredBy,
|
||||
@Nullable Instant tsLastUpdate
|
||||
@Nullable Instant tsLastUpdate,
|
||||
@Nullable Boolean sslChainValid,
|
||||
@Nullable Boolean sslHostValid,
|
||||
@Nullable Boolean sslDateValid
|
||||
)
|
||||
implements WritableModel
|
||||
{
|
||||
@@ -102,7 +106,11 @@ public record DomainSecurityRecord(
|
||||
rs.getString("DOMAIN_SECURITY_INFORMATION.HEADER_X_XSS_PROTECTION"),
|
||||
rs.getString("DOMAIN_SECURITY_INFORMATION.HEADER_SERVER"),
|
||||
rs.getString("DOMAIN_SECURITY_INFORMATION.HEADER_X_POWERED_BY"),
|
||||
rs.getObject("DOMAIN_SECURITY_INFORMATION.TS_LAST_UPDATE", Instant.class));
|
||||
rs.getObject("DOMAIN_SECURITY_INFORMATION.TS_LAST_UPDATE", Instant.class),
|
||||
rs.getObject("SSL_CHAIN_VALID", Boolean.class),
|
||||
rs.getObject("SSL_HOST_VALID", Boolean.class),
|
||||
rs.getObject("SSL_DATE_VALID", Boolean.class)
|
||||
);
|
||||
}
|
||||
|
||||
private static HttpSchema httpSchemaFromString(@Nullable String schema) {
|
||||
@@ -149,8 +157,11 @@ public record DomainSecurityRecord(
|
||||
header_x_powered_by,
|
||||
ssl_cert_public_key_hash,
|
||||
asn,
|
||||
ts_last_update)
|
||||
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
|
||||
ts_last_update,
|
||||
ssl_chain_valid,
|
||||
ssl_host_valid,
|
||||
ssl_date_valid)
|
||||
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
|
||||
"""))
|
||||
{
|
||||
ps.setInt(1, domainId());
|
||||
@@ -294,6 +305,25 @@ public record DomainSecurityRecord(
|
||||
} else {
|
||||
ps.setTimestamp(32, java.sql.Timestamp.from(tsLastUpdate()));
|
||||
}
|
||||
|
||||
if (sslChainValid() == null) {
|
||||
ps.setNull(33, java.sql.Types.BOOLEAN);
|
||||
} else {
|
||||
ps.setBoolean(33, sslChainValid());
|
||||
}
|
||||
|
||||
if (sslHostValid() == null) {
|
||||
ps.setNull(34, java.sql.Types.BOOLEAN);
|
||||
} else {
|
||||
ps.setBoolean(34, sslHostValid());
|
||||
}
|
||||
|
||||
if (sslDateValid() == null) {
|
||||
ps.setNull(35, java.sql.Types.BOOLEAN);
|
||||
} else {
|
||||
ps.setBoolean(35, sslDateValid());
|
||||
}
|
||||
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
@@ -332,6 +362,13 @@ public record DomainSecurityRecord(
|
||||
private String headerXPoweredBy;
|
||||
private Instant tsLastUpdate;
|
||||
|
||||
private Boolean isCertChainValid;
|
||||
private Boolean isCertHostValid;
|
||||
private Boolean isCertDateValid;
|
||||
|
||||
|
||||
private static Instant MAX_UNIX_TIMESTAMP = Instant.ofEpochSecond(Integer.MAX_VALUE);
|
||||
|
||||
public Builder() {
|
||||
// Default values for boolean fields
|
||||
this.sslCertWildcard = false;
|
||||
@@ -374,12 +411,18 @@ public record DomainSecurityRecord(
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder sslCertNotBefore(Instant sslCertNotBefore) {
|
||||
public Builder sslCertNotBefore(@NotNull Instant sslCertNotBefore) {
|
||||
if (sslCertNotBefore.isAfter(MAX_UNIX_TIMESTAMP)) {
|
||||
sslCertNotBefore = MAX_UNIX_TIMESTAMP;
|
||||
}
|
||||
this.sslCertNotBefore = sslCertNotBefore;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder sslCertNotAfter(Instant sslCertNotAfter) {
|
||||
public Builder sslCertNotAfter(@NotNull Instant sslCertNotAfter) {
|
||||
if (sslCertNotAfter.isAfter(MAX_UNIX_TIMESTAMP)) {
|
||||
sslCertNotAfter = MAX_UNIX_TIMESTAMP;
|
||||
}
|
||||
this.sslCertNotAfter = sslCertNotAfter;
|
||||
return this;
|
||||
}
|
||||
@@ -499,6 +542,21 @@ public record DomainSecurityRecord(
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder sslChainValid(@Nullable Boolean isCertChainValid) {
|
||||
this.isCertChainValid = isCertChainValid;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder sslHostValid(@Nullable Boolean isCertHostValid) {
|
||||
this.isCertHostValid = isCertHostValid;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder sslDateValid(@Nullable Boolean isCertDateValid) {
|
||||
this.isCertDateValid = isCertDateValid;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainSecurityRecord build() {
|
||||
return new DomainSecurityRecord(
|
||||
domainId,
|
||||
@@ -532,7 +590,10 @@ public record DomainSecurityRecord(
|
||||
headerXXssProtection,
|
||||
headerServer,
|
||||
headerXPoweredBy,
|
||||
tsLastUpdate
|
||||
tsLastUpdate,
|
||||
isCertChainValid,
|
||||
isCertHostValid,
|
||||
isCertDateValid
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@@ -1,6 +1,6 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
public sealed interface RootDomainReference {
|
||||
record ById(long id) implements RootDomainReference { }
|
||||
record ByIdAndName(long id, String name) implements RootDomainReference { }
|
||||
record ByName(String name) implements RootDomainReference { }
|
||||
}
|
||||
|
@@ -0,0 +1,12 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
public enum SchemaChange {
|
||||
UNKNOWN,
|
||||
NONE,
|
||||
HTTP_TO_HTTPS,
|
||||
HTTPS_TO_HTTP;
|
||||
|
||||
public boolean isSignificant() {
|
||||
return this != NONE && this != UNKNOWN;
|
||||
}
|
||||
}
|
@@ -2,6 +2,9 @@ package nu.marginalia.ping.model.comparison;
|
||||
|
||||
import nu.marginalia.ping.model.DomainAvailabilityRecord;
|
||||
import nu.marginalia.ping.model.DomainSecurityRecord;
|
||||
import nu.marginalia.ping.model.HttpSchema;
|
||||
import nu.marginalia.ping.model.SchemaChange;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
@@ -20,7 +23,8 @@ public record SecurityInformationChange(
|
||||
Duration oldCertificateTimeToExpiry,
|
||||
boolean isSecurityHeadersChanged,
|
||||
boolean isIpAddressChanged,
|
||||
boolean isSoftwareHeaderChanged
|
||||
boolean isSoftwareHeaderChanged,
|
||||
SchemaChange schemaChange
|
||||
) {
|
||||
public static SecurityInformationChange between(
|
||||
DomainSecurityRecord before, DomainAvailabilityRecord availabilityBefore,
|
||||
@@ -43,9 +47,10 @@ public record SecurityInformationChange(
|
||||
);
|
||||
|
||||
boolean securityHeadersChanged = before.securityHeadersHash() != after.securityHeadersHash();
|
||||
|
||||
boolean softwareChanged = !Objects.equals(before.headerServer(), after.headerServer());
|
||||
|
||||
SchemaChange schemaChange = getSchemaChange(before, after);
|
||||
|
||||
// Note we don't include IP address changes in the overall change status,
|
||||
// as this is not alone considered a change in security information; we may have
|
||||
// multiple IP addresses for a domain, and the IP address may change frequently
|
||||
@@ -55,7 +60,8 @@ public record SecurityInformationChange(
|
||||
|| certificateFingerprintChanged
|
||||
|| securityHeadersChanged
|
||||
|| certificateProfileChanged
|
||||
|| softwareChanged;
|
||||
|| softwareChanged
|
||||
|| schemaChange.isSignificant();
|
||||
|
||||
return new SecurityInformationChange(
|
||||
isChanged,
|
||||
@@ -69,9 +75,36 @@ public record SecurityInformationChange(
|
||||
oldCertificateTimeToExpiry,
|
||||
securityHeadersChanged,
|
||||
ipChanged,
|
||||
softwareChanged
|
||||
softwareChanged,
|
||||
schemaChange
|
||||
);
|
||||
}
|
||||
|
||||
private static @NotNull SchemaChange getSchemaChange(DomainSecurityRecord before, DomainSecurityRecord after) {
|
||||
if (before.httpSchema() == null || after.httpSchema() == null) {
|
||||
return SchemaChange.UNKNOWN;
|
||||
}
|
||||
|
||||
boolean beforeIsHttp = before.httpSchema() == HttpSchema.HTTP;
|
||||
boolean afterIsHttp = after.httpSchema() == HttpSchema.HTTP;
|
||||
boolean beforeIsHttps = before.httpSchema() == HttpSchema.HTTPS;
|
||||
boolean afterIsHttps = after.httpSchema() == HttpSchema.HTTPS;
|
||||
|
||||
SchemaChange schemaChange;
|
||||
|
||||
if (beforeIsHttp && afterIsHttp) {
|
||||
schemaChange = SchemaChange.NONE;
|
||||
} else if (beforeIsHttps && afterIsHttps) {
|
||||
schemaChange = SchemaChange.NONE;
|
||||
} else if (beforeIsHttp && afterIsHttps) {
|
||||
schemaChange = SchemaChange.HTTP_TO_HTTPS;
|
||||
} else if (beforeIsHttps && afterIsHttp) {
|
||||
schemaChange = SchemaChange.HTTPS_TO_HTTP;
|
||||
} else {
|
||||
schemaChange = SchemaChange.UNKNOWN;
|
||||
}
|
||||
return schemaChange;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@@ -0,0 +1,59 @@
|
||||
package nu.marginalia.ping.ssl;
|
||||
|
||||
import org.bouncycastle.asn1.ASN1OctetString;
|
||||
import org.bouncycastle.asn1.ASN1Primitive;
|
||||
import org.bouncycastle.asn1.x509.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.security.cert.X509Certificate;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class AIAExtractor {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(AIAExtractor.class);
|
||||
|
||||
public static List<String> getCaIssuerUrls(X509Certificate certificate) {
|
||||
List<String> caIssuerUrls = new ArrayList<>();
|
||||
|
||||
try {
|
||||
// Get the AIA extension value
|
||||
byte[] aiaExtensionValue = certificate.getExtensionValue(Extension.authorityInfoAccess.getId());
|
||||
if (aiaExtensionValue == null) {
|
||||
logger.warn("No AIA extension found");
|
||||
return caIssuerUrls;
|
||||
}
|
||||
|
||||
// Parse the extension - first unwrap the OCTET STRING
|
||||
ASN1OctetString octetString = ASN1OctetString.getInstance(aiaExtensionValue);
|
||||
ASN1Primitive aiaObj = ASN1Primitive.fromByteArray(octetString.getOctets());
|
||||
|
||||
// Parse as AuthorityInformationAccess
|
||||
AuthorityInformationAccess aia = AuthorityInformationAccess.getInstance(aiaObj);
|
||||
|
||||
if (aia != null) {
|
||||
AccessDescription[] accessDescriptions = aia.getAccessDescriptions();
|
||||
|
||||
for (AccessDescription accessDesc : accessDescriptions) {
|
||||
// Check if this is a CA Issuers access method
|
||||
if (X509ObjectIdentifiers.id_ad_caIssuers.equals(accessDesc.getAccessMethod())) {
|
||||
GeneralName accessLocation = accessDesc.getAccessLocation();
|
||||
|
||||
// Check if it's a URI
|
||||
if (accessLocation.getTagNo() == GeneralName.uniformResourceIdentifier) {
|
||||
String url = accessLocation.getName().toString();
|
||||
caIssuerUrls.add(url);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
logger.error("Error parsing AIA extension: {}", e.getMessage());
|
||||
}
|
||||
|
||||
return caIssuerUrls;
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,273 @@
|
||||
package nu.marginalia.ping.ssl;
|
||||
|
||||
import com.google.common.cache.Cache;
|
||||
import com.google.common.cache.CacheBuilder;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.impl.classic.HttpClientBuilder;
|
||||
import org.apache.hc.core5.http.ClassicHttpRequest;
|
||||
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
||||
import org.bouncycastle.asn1.ASN1OctetString;
|
||||
import org.bouncycastle.asn1.ASN1Primitive;
|
||||
import org.bouncycastle.asn1.x509.*;
|
||||
import org.bouncycastle.cert.X509CertificateHolder;
|
||||
import org.bouncycastle.cert.jcajce.JcaX509CertificateConverter;
|
||||
import org.bouncycastle.cms.CMSSignedData;
|
||||
import org.bouncycastle.openssl.PEMParser;
|
||||
import org.bouncycastle.util.Store;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.StringReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.security.cert.CertificateFactory;
|
||||
import java.security.cert.TrustAnchor;
|
||||
import java.security.cert.X509Certificate;
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
public class CertificateFetcher {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(CertificateFetcher.class);
|
||||
|
||||
private static HttpClient client = HttpClientBuilder.create()
|
||||
.build();
|
||||
|
||||
private static Cache<String, X509Certificate> cache = CacheBuilder
|
||||
.newBuilder()
|
||||
.expireAfterAccess(Duration.ofHours(6))
|
||||
.maximumSize(10_000)
|
||||
.build();
|
||||
|
||||
|
||||
public static List<X509Certificate> fetchMissingIntermediates(X509Certificate leafCert) {
|
||||
List<X509Certificate> intermediates = new ArrayList<>();
|
||||
|
||||
// Get CA Issuer URLs from AIA extension
|
||||
List<String> caIssuerUrls = AIAExtractor.getCaIssuerUrls(leafCert);
|
||||
|
||||
for (String url : caIssuerUrls) {
|
||||
try {
|
||||
// Check cache first
|
||||
X509Certificate cached = cache.getIfPresent(url);
|
||||
if (cached != null) {
|
||||
intermediates.add(cached);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Download certificate
|
||||
X509Certificate downloaded = downloadCertificate(url);
|
||||
if (downloaded != null) {
|
||||
// Verify this certificate can actually sign the leaf
|
||||
if (canSign(downloaded, leafCert)) {
|
||||
intermediates.add(downloaded);
|
||||
cache.put(url, downloaded);
|
||||
logger.info("Downloaded certificate for url: {}", url);
|
||||
} else {
|
||||
logger.warn("Downloaded certificate cannot sign leaf cert from: {}", url);
|
||||
}
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
logger.error("Failed to fetch certificate from {}: {}", url, e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
return intermediates;
|
||||
}
|
||||
private static X509Certificate downloadCertificate(String urlString) {
|
||||
try {
|
||||
ClassicHttpRequest request = ClassicRequestBuilder.create("GET")
|
||||
.addHeader("User-Agent", WmsaHome.getUserAgent() + " (Certificate Fetcher)")
|
||||
.setUri(urlString)
|
||||
.build();
|
||||
|
||||
byte[] data = client.execute(request, rsp -> {
|
||||
var entity = rsp.getEntity();
|
||||
if (entity == null) {
|
||||
logger.warn("GET request returned no content for {}", urlString);
|
||||
return null;
|
||||
}
|
||||
return entity.getContent().readAllBytes();
|
||||
});
|
||||
|
||||
if (data.length == 0) {
|
||||
logger.warn("Empty response from {}", urlString);
|
||||
return null;
|
||||
}
|
||||
|
||||
// Try different formats based on file extension
|
||||
if (urlString.toLowerCase().endsWith(".p7c") || urlString.toLowerCase().endsWith(".p7b")) {
|
||||
return parsePKCS7(data);
|
||||
} else {
|
||||
return parseX509(data);
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
logger.warn("Failed to fetch certificate from {}: {}", urlString, e.getMessage());
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private static List<X509Certificate> parseMultiplePEM(byte[] data) throws Exception {
|
||||
List<X509Certificate> certificates = new ArrayList<>();
|
||||
|
||||
try (StringReader stringReader = new StringReader(new String(data, StandardCharsets.UTF_8));
|
||||
PEMParser pemParser = new PEMParser(stringReader)) {
|
||||
|
||||
JcaX509CertificateConverter converter = new JcaX509CertificateConverter();
|
||||
Object object;
|
||||
|
||||
while ((object = pemParser.readObject()) != null) {
|
||||
if (object instanceof X509CertificateHolder) {
|
||||
X509CertificateHolder certHolder = (X509CertificateHolder) object;
|
||||
certificates.add(converter.getCertificate(certHolder));
|
||||
} else if (object instanceof X509Certificate) {
|
||||
certificates.add((X509Certificate) object);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return certificates;
|
||||
}
|
||||
private static X509Certificate parseX509(byte[] data) throws Exception {
|
||||
CertificateFactory cf = CertificateFactory.getInstance("X.509");
|
||||
return (X509Certificate) cf.generateCertificate(new ByteArrayInputStream(data));
|
||||
}
|
||||
|
||||
private static X509Certificate parsePKCS7(byte[] data) throws Exception {
|
||||
try {
|
||||
// Parse PKCS#7/CMS structure
|
||||
CMSSignedData cmsData = new CMSSignedData(data);
|
||||
Store<X509CertificateHolder> certStore = cmsData.getCertificates();
|
||||
|
||||
JcaX509CertificateConverter converter = new JcaX509CertificateConverter();
|
||||
|
||||
// Get the first certificate from the store
|
||||
for (X509CertificateHolder certHolder : certStore.getMatches(null)) {
|
||||
X509Certificate cert = converter.getCertificate(certHolder);
|
||||
return cert;
|
||||
}
|
||||
|
||||
logger.warn("No certificates found in PKCS#7 structure");
|
||||
return null;
|
||||
|
||||
} catch (Exception e) {
|
||||
logger.error("Failed to parse PKCS#7 structure from {}: {}", data.length, e.getMessage());
|
||||
return parseX509(data);
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean canSign(X509Certificate issuerCert, X509Certificate subjectCert) {
|
||||
try {
|
||||
// Check if the issuer DN matches
|
||||
if (!issuerCert.getSubjectDN().equals(subjectCert.getIssuerDN())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Try to verify the signature
|
||||
subjectCert.verify(issuerCert.getPublicKey());
|
||||
return true;
|
||||
|
||||
} catch (Exception e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Recursive fetching for complete chains
|
||||
public static List<X509Certificate> buildCompleteChain(X509Certificate leafCert) {
|
||||
List<X509Certificate> completeChain = new ArrayList<>();
|
||||
completeChain.add(leafCert);
|
||||
|
||||
X509Certificate currentCert = leafCert;
|
||||
int maxDepth = 10; // Prevent infinite loops
|
||||
|
||||
while (maxDepth-- > 0) {
|
||||
// If current cert is self-signed (root), we're done
|
||||
if (currentCert.getSubjectDN().equals(currentCert.getIssuerDN())) {
|
||||
break;
|
||||
}
|
||||
|
||||
// Try to find the issuer
|
||||
List<X509Certificate> intermediates = fetchMissingIntermediates(currentCert);
|
||||
if (intermediates.isEmpty()) {
|
||||
logger.error("Could not find issuer for: {}", currentCert.getSubjectDN());
|
||||
break;
|
||||
}
|
||||
|
||||
// Add the first valid intermediate
|
||||
X509Certificate intermediate = intermediates.get(0);
|
||||
completeChain.add(intermediate);
|
||||
currentCert = intermediate;
|
||||
}
|
||||
|
||||
return completeChain;
|
||||
}
|
||||
|
||||
// Add this to your AIAExtractor class if not already present
|
||||
public static List<String> getOCSPUrls(X509Certificate certificate) {
|
||||
List<String> ocspUrls = new ArrayList<>();
|
||||
|
||||
try {
|
||||
byte[] aiaExtensionValue = certificate.getExtensionValue(Extension.authorityInfoAccess.getId());
|
||||
if (aiaExtensionValue == null) {
|
||||
return ocspUrls;
|
||||
}
|
||||
|
||||
ASN1OctetString octetString = ASN1OctetString.getInstance(aiaExtensionValue);
|
||||
ASN1Primitive aiaObj = ASN1Primitive.fromByteArray(octetString.getOctets());
|
||||
AuthorityInformationAccess aia = AuthorityInformationAccess.getInstance(aiaObj);
|
||||
|
||||
if (aia != null) {
|
||||
AccessDescription[] accessDescriptions = aia.getAccessDescriptions();
|
||||
|
||||
for (AccessDescription accessDesc : accessDescriptions) {
|
||||
if (X509ObjectIdentifiers.id_ad_ocsp.equals(accessDesc.getAccessMethod())) {
|
||||
GeneralName accessLocation = accessDesc.getAccessLocation();
|
||||
if (accessLocation.getTagNo() == GeneralName.uniformResourceIdentifier) {
|
||||
String url = accessLocation.getName().toString();
|
||||
ocspUrls.add(url);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
logger.error("Error parsing AIA extension for OCSP: {}", e.getMessage());
|
||||
}
|
||||
|
||||
return ocspUrls;
|
||||
}
|
||||
|
||||
public static Set<TrustAnchor> getRootCerts(String bundleUrl) throws Exception {
|
||||
ClassicHttpRequest request = ClassicRequestBuilder.create("GET")
|
||||
.addHeader("User-Agent", WmsaHome.getUserAgent() + " (Certificate Fetcher)")
|
||||
.setUri(bundleUrl)
|
||||
.build();
|
||||
|
||||
byte[] data = client.execute(request, rsp -> {
|
||||
var entity = rsp.getEntity();
|
||||
if (entity == null) {
|
||||
logger.warn("GET request returned no content for {}", bundleUrl);
|
||||
return null;
|
||||
}
|
||||
return entity.getContent().readAllBytes();
|
||||
});
|
||||
|
||||
List<TrustAnchor> anchors = new ArrayList<>();
|
||||
for (var cert : parseMultiplePEM(data)) {
|
||||
try {
|
||||
anchors.add(new TrustAnchor(cert, null));
|
||||
} catch (Exception e) {
|
||||
logger.warn("Failed to create TrustAnchor for certificate: {}", e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
logger.info("Loaded {} root certificates from {}", anchors.size(), bundleUrl);
|
||||
|
||||
return Set.copyOf(anchors);
|
||||
}
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user