1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits

...

16 Commits

Author SHA1 Message Date
Viktor Lofgren
36581b25c2 (ndp) Fix process tracking in domain discovery process 2025-06-21 14:35:25 +02:00
Viktor Lofgren
52ff7fb4dd (ndp) Add a process for adding new domains to be crawled
This is a working "work in progress" commit, will need more refinement, but given the usual difficulties in testing crawler-adjacent code without actually crawling, it needs some maturation time in production.
2025-06-21 14:10:27 +02:00
Viktor Lofgren
a4e49e658a (ping) Add README for ping 2025-06-19 11:21:52 +02:00
Viktor Lofgren
e2c56dc3ca (search) Clean up the rate limiting
We fail quietly to make life harder for the bot farmers
2025-06-18 11:26:30 +02:00
Viktor Lofgren
470b866008 (search) Clean up the rate limiting
We fail quietly to make life harder for the bot farmers
2025-06-18 11:22:26 +02:00
Viktor Lofgren
4895a2ac7a (search) Clean up the rate limiting
We fail quietly to make life harder for the bot farmers
2025-06-18 11:20:24 +02:00
Viktor Lofgren
fd32ae9fa7 (search) Add automatic rate limiting to /site
Fix typo
2025-06-18 11:10:08 +02:00
Viktor Lofgren
470651ea4c (search) Add automatic rate limiting to /site 2025-06-18 11:04:36 +02:00
Viktor Lofgren
8d4829e783 (ping) Change cookie specification to ignore cookies 2025-06-17 12:26:34 +02:00
Viktor Lofgren
1290bc15dc (ping) Reduce retries for SocketException and pals 2025-06-16 22:35:33 +02:00
Viktor Lofgren
e7fa558954 (ping) Disable some cert validation logic for now 2025-06-16 22:00:32 +02:00
Viktor Lofgren
720685bf3f (ping) Persist more detailed information about why a cert is invalid
The change also alters the validator to be less judgemental, and accept some invalid chains based on looking like we've simply not got access to a (valid) intermediate cert.
2025-06-16 19:44:22 +02:00
Viktor Lofgren
cbec63c7da (ping) Pull root certificates from cacerts.pem 2025-06-16 19:21:05 +02:00
Viktor Lofgren
b03ca75785 (ping) Correct test so that it does not spam an innocent webmaster with requests 2025-06-16 17:06:14 +02:00
Viktor Lofgren
184aedc071 (ping) Deploy new custom cert validator for fingerprinting purposes 2025-06-16 16:36:23 +02:00
Viktor Lofgren
0275bad281 (ping) Limit SSL certificate validity dates to a maximum timestamp as permitted by database 2025-06-16 00:32:03 +02:00
43 changed files with 1948 additions and 656 deletions

View File

@@ -0,0 +1,7 @@
-- Add additional summary columns to DOMAIN_SECURITY_INFORMATION table
-- to make it easier to get more information about the SSL certificate's validity
ALTER TABLE DOMAIN_SECURITY_INFORMATION ADD COLUMN SSL_CHAIN_VALID BOOLEAN DEFAULT NULL;
ALTER TABLE DOMAIN_SECURITY_INFORMATION ADD COLUMN SSL_HOST_VALID BOOLEAN DEFAULT NULL;
ALTER TABLE DOMAIN_SECURITY_INFORMATION ADD COLUMN SSL_DATE_VALID BOOLEAN DEFAULT NULL;
OPTIMIZE TABLE DOMAIN_SECURITY_INFORMATION;

View File

@@ -0,0 +1,12 @@
-- Table holding domains to be processed by the NDP in order to figure out whether to add them to
-- be crawled.
CREATE TABLE IF NOT EXISTS NDP_NEW_DOMAINS(
DOMAIN_ID INT NOT NULL PRIMARY KEY,
STATE ENUM ('NEW', 'ACCEPTED', 'REJECTED') NOT NULL DEFAULT 'NEW',
PRIORITY INT NOT NULL DEFAULT 0,
TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
CHECK_COUNT INT NOT NULL DEFAULT 0
);
CREATE INDEX IF NOT EXISTS NDP_NEW_DOMAINS__STATE_PRIORITY ON NDP_NEW_DOMAINS (STATE, PRIORITY DESC);

View File

@@ -20,6 +20,7 @@ dependencies {
implementation project(':code:processes:live-crawling-process')
implementation project(':code:processes:loading-process')
implementation project(':code:processes:ping-process')
implementation project(':code:processes:new-domain-process')
implementation project(':code:processes:converting-process')
implementation project(':code:processes:index-constructor-process')
@@ -41,7 +42,6 @@ dependencies {
implementation project(':code:functions:nsfw-domain-filter')
implementation project(':code:execution:api')
implementation project(':code:processes:crawling-process:model')
implementation project(':code:processes:crawling-process:model')
implementation project(':code:processes:crawling-process:ft-link-parser')
implementation project(':code:index:index-journal')

View File

@@ -14,6 +14,7 @@ public enum ExecutorActor {
PROC_CRAWLER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
PROC_PING_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.REALTIME),
PROC_EXPORT_TASKS_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
PROC_NDP_SPAWNER(NodeProfile.MIXED, NodeProfile.REALTIME),
ADJACENCY_CALCULATION(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
EXPORT_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
EXPORT_SEGMENTATION_MODEL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),

View File

@@ -49,6 +49,7 @@ public class ExecutorActorControlService {
RecrawlSingleDomainActor recrawlSingleDomainActor,
RestoreBackupActor restoreBackupActor,
ConverterMonitorActor converterMonitorFSM,
NdpMonitorActor ndpMonitorActor,
PingMonitorActor pingMonitorActor,
CrawlerMonitorActor crawlerMonitorActor,
LiveCrawlerMonitorActor liveCrawlerMonitorActor,
@@ -93,7 +94,7 @@ public class ExecutorActorControlService {
register(ExecutorActor.PROC_PING_SPAWNER, pingMonitorActor);
register(ExecutorActor.PROC_LIVE_CRAWL_SPAWNER, liveCrawlerMonitorActor);
register(ExecutorActor.PROC_EXPORT_TASKS_SPAWNER, exportTasksMonitorActor);
register(ExecutorActor.PROC_NDP_SPAWNER, ndpMonitorActor);
register(ExecutorActor.MONITOR_PROCESS_LIVENESS, processMonitorFSM);
register(ExecutorActor.MONITOR_FILE_STORAGE, fileStorageMonitorActor);

View File

@@ -0,0 +1,29 @@
package nu.marginalia.actor.proc;
import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
import nu.marginalia.mq.persistence.MqPersistence;
import nu.marginalia.mqapi.ProcessInboxNames;
import nu.marginalia.process.ProcessService;
import nu.marginalia.service.module.ServiceConfiguration;
@Singleton
public class NdpMonitorActor extends AbstractProcessSpawnerActor {
@Inject
public NdpMonitorActor(Gson gson,
ServiceConfiguration configuration,
MqPersistence persistence,
ProcessService processService) {
super(gson,
configuration,
persistence,
processService,
ProcessInboxNames.NDP_INBOX,
ProcessService.ProcessId.NDP);
}
}

View File

@@ -8,6 +8,7 @@ import nu.marginalia.crawl.CrawlerMain;
import nu.marginalia.index.IndexConstructorMain;
import nu.marginalia.livecrawler.LiveCrawlerMain;
import nu.marginalia.loading.LoaderMain;
import nu.marginalia.ndp.NdpMain;
import nu.marginalia.ping.PingMain;
import nu.marginalia.service.control.ServiceEventLog;
import nu.marginalia.service.server.BaseServiceParams;
@@ -57,6 +58,7 @@ public class ProcessService {
CONVERTER(ConverterMain.class),
LOADER(LoaderMain.class),
INDEX_CONSTRUCTOR(IndexConstructorMain.class),
NDP(NdpMain.class),
EXPORT_TASKS(ExportTasksMain.class),
;
@@ -72,6 +74,7 @@ public class ProcessService {
case CONVERTER -> "CONVERTER_PROCESS_OPTS";
case LOADER -> "LOADER_PROCESS_OPTS";
case PING -> "PING_PROCESS_OPTS";
case NDP -> "NDP_PROCESS_OPTS";
case INDEX_CONSTRUCTOR -> "INDEX_CONSTRUCTION_PROCESS_OPTS";
case EXPORT_TASKS -> "EXPORT_TASKS_PROCESS_OPTS";
};

View File

@@ -0,0 +1,73 @@
plugins {
id 'java'
id 'application'
id 'jvm-test-suite'
}
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
application {
mainClass = 'nu.marginalia.ping.PingMain'
applicationName = 'ping-process'
}
tasks.distZip.enabled = false
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:common:db')
implementation project(':code:common:model')
implementation project(':code:common:config')
implementation project(':code:common:service')
implementation project(':code:libraries:domain-lock')
implementation project(':code:libraries:geo-ip')
implementation project(':code:libraries:message-queue')
implementation project(':code:libraries:blocking-thread-pool')
implementation project(':code:processes:process-mq-api')
implementation project(':code:processes:crawling-process:ft-content-type')
implementation project(':code:processes:crawling-process:ft-link-parser')
implementation libs.bundles.slf4j
implementation libs.notnull
implementation libs.guava
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}
implementation libs.gson
implementation libs.zstd
implementation libs.bucket4j
implementation libs.crawlercommons
implementation libs.jsoup
implementation libs.fastutil
implementation libs.bundles.curator
implementation libs.bundles.mariadb
implementation libs.bundles.httpcomponents
implementation libs.commons.lang3
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
testImplementation libs.wiremock
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
testImplementation libs.commons.codec
testImplementation 'org.testcontainers:mariadb:1.17.4'
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
testImplementation project(':code:libraries:test-helpers')
testImplementation project(':code:processes:test-data')
}

View File

@@ -0,0 +1,142 @@
package nu.marginalia.ndp;
import com.google.inject.Inject;
import nu.marginalia.WmsaHome;
import nu.marginalia.contenttype.ContentType;
import nu.marginalia.contenttype.DocumentBodyToString;
import nu.marginalia.coordination.DomainCoordinator;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.ndp.io.HttpClientProvider;
import nu.marginalia.ndp.model.DomainToTest;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.core5.http.ClassicHttpResponse;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.net.URI;
import java.net.URISyntaxException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.time.Duration;
import java.time.Instant;
import java.util.Objects;
import java.util.concurrent.TimeUnit;
public class DomainEvaluator {
private final HttpClient client;
private final String userAgentString = WmsaHome.getUserAgent().uaString();
private final LinkParser linkParser = new LinkParser();
private final DomainCoordinator domainCoordinator;
sealed interface FetchResult permits FetchSuccess, FetchFailure {}
record FetchSuccess(Document content) implements FetchResult {}
record FetchFailure(String reason) implements FetchResult {}
@Inject
public DomainEvaluator(DomainCoordinator domainCoordinator) throws NoSuchAlgorithmException, KeyManagementException {
this.domainCoordinator = domainCoordinator;
client = HttpClientProvider.createClient();
}
public boolean evaluateDomain(DomainToTest domain) throws Exception {
var edgeDomain = new EdgeDomain(domain.domainName());
try (var lock = domainCoordinator.lockDomain(edgeDomain)) {
var result = fetch(domain.domainName());
Instant start = Instant.now();
var ret = switch(result) {
case FetchSuccess(Document content) -> validateHtml(content, edgeDomain);
case FetchFailure failure -> false;
};
// Sleep for up to 1 second before we yield the lock to respect rate limits reasonably well
Instant end = Instant.now();
Duration sleepDuration = Duration.ofSeconds(1).minus(Duration.between(start, end));
if (sleepDuration.isPositive()) {
TimeUnit.MILLISECONDS.sleep(sleepDuration.toMillis());
}
return ret;
}
}
private boolean validateHtml(Document content, EdgeDomain domain) {
var rootUrl = domain.toRootUrlHttps();
var text = content.body().text();
if (text.length() < 100) {
return false; // Too short to be a valid page
}
if (text.contains("404 Not Found") || text.contains("Page not found")) {
return false; // Common indicators of a 404 page
}
for (var metaTag : content.select("meta")) {
if ("refresh".equalsIgnoreCase(metaTag.attr("http-equiv"))) {
return false; // Page has a refresh tag, very likely a parked domain
}
}
boolean hasInternalLink = false;
for (var atag : content.select("a")) {
var link = linkParser.parseLink(rootUrl, atag);
if (link.isEmpty()) {
continue; // Skip invalid links
}
var edgeUrl = link.get();
if (Objects.equals(domain, edgeUrl.getDomain())) {
hasInternalLink = true;
}
}
return hasInternalLink;
}
private FetchResult fetch(String domain) throws URISyntaxException {
var uri = new URI("https://" + domain + "/");
var request = ClassicRequestBuilder.get(uri)
.addHeader("User-Agent", userAgentString)
.addHeader("Accept-Encoding", "gzip")
.addHeader("Accept", "text/html,application/xhtml+xml;q=0.9")
.build();
try {
return client.execute(request, (rsp) -> responseHandler(rsp, domain));
} catch (Exception e) {
return new FetchFailure("Failed to fetch domain: " + e.getMessage());
}
}
private FetchResult responseHandler(ClassicHttpResponse rsp, String domain) {
if (rsp.getEntity() == null)
return new FetchFailure("No content returned from " + domain);
try {
int code = rsp.getCode();
byte[] content = rsp.getEntity().getContent().readAllBytes();
if (code >= 300) {
return new FetchFailure("Received HTTP " + code + " from " + domain);
}
ContentType contentType = ContentType.parse(rsp.getEntity().getContentType());
var html = DocumentBodyToString.getStringData(contentType, content);
return new FetchSuccess(Jsoup.parse(html));
}
catch (Exception e) {
EntityUtils.consumeQuietly(rsp.getEntity());
return new FetchFailure("Failed to read content from " + domain + ": " + e.getMessage());
}
}
}

View File

@@ -0,0 +1,127 @@
package nu.marginalia.ndp;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.nodecfg.NodeConfigurationService;
import org.jetbrains.annotations.NotNull;
import java.util.HashSet;
import java.util.PriorityQueue;
import java.util.Set;
/** DomainAllocator is responsible for assigning domains to partitions/nodes.
* This is ensured to make sure that domains are evenly distributed across the nodes.
*/
public class DomainNodeAllocator {
private final NodeConfigurationService nodeConfigurationService;
private final HikariDataSource dataSource;
private record NodeCount(int nodeId, int count)
implements Comparable<NodeCount>
{
public NodeCount incrementCount() {
return new NodeCount(nodeId, count + 1);
}
@Override
public int compareTo(@NotNull DomainNodeAllocator.NodeCount o) {
return Integer.compare(this.count, o.count);
}
}
private final PriorityQueue<NodeCount> countPerNode = new PriorityQueue<>();
volatile boolean initialized = false;
@Inject
public DomainNodeAllocator(NodeConfigurationService nodeConfigurationService, HikariDataSource dataSource) {
this.nodeConfigurationService = nodeConfigurationService;
this.dataSource = dataSource;
Thread.ofPlatform()
.name("DomainNodeAllocator::initialize()")
.start(this::initialize);
}
public void initialize() {
if (initialized) return;
Set<Integer> viableNodes = new HashSet<>();
// Find all viable nodes that can handle batch crawls
for (var node : nodeConfigurationService.getAll()) {
if (node.disabled())
continue;
if (node.profile().permitBatchCrawl())
viableNodes.add(node.node());
}
// Fetch the current counts of domains per node from the database
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
SELECT COUNT(*) AS CNT, NODE_AFFINITY
FROM EC_DOMAIN
WHERE NODE_AFFINITY>0
GROUP BY NODE_AFFINITY
"""))
{
var rs = stmt.executeQuery();
while (rs.next()) {
int nodeId = rs.getInt("NODE_AFFINITY");
int count = rs.getInt("CNT");
if (viableNodes.remove(nodeId)) {
countPerNode.add(new NodeCount(nodeId, count));
}
}
} catch (Exception e) {
throw new RuntimeException("Failed to load domain counts from database", e);
}
// Add any remaining viable nodes that were not found in the database
for (int nodeId : viableNodes) {
countPerNode.add(new NodeCount(nodeId, 0));
}
initialized = true;
}
private void ensureInitialized() {
if (initialized) return;
synchronized (this) {
while (!initialized) {
try {
// Wait until the initialization is complete
this.wait(1000);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new RuntimeException("DomainAllocator initialization interrupted", e);
}
}
}
}
public synchronized int totalCount() {
ensureInitialized();
return countPerNode.stream().mapToInt(NodeCount::count).sum();
}
/** Returns the next node ID to assign a domain to.
* This method is synchronized to ensure thread safety when multiple threads are allocating domains.
* The node ID returned is guaranteed to be one of the viable nodes configured in the system.
*/
public synchronized int nextNodeId() {
ensureInitialized();
// Synchronized is fine here as this is not a hot path
// (and PriorityBlockingQueue won't help since we're re-adding the same element with a new count all the time)
NodeCount allocation = countPerNode.remove();
countPerNode.add(allocation.incrementCount());
return allocation.nodeId();
}
}

View File

@@ -0,0 +1,148 @@
package nu.marginalia.ndp;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.ndp.model.DomainToTest;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.Connection;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
public class DomainTestingQueue {
private final ArrayBlockingQueue<DomainToTest> queue = new ArrayBlockingQueue<>(1000);
// This will grow quite large, but should be manageable in memory, as theoretical maximum is around 100M domains,
// order of 2 GB in memory.
private final ConcurrentHashMap<String, Boolean> takenDomains = new ConcurrentHashMap<>();
private final HikariDataSource dataSource;
private static Logger logger = LoggerFactory.getLogger(DomainTestingQueue.class);
@Inject
public DomainTestingQueue(HikariDataSource dataSource) {
this.dataSource = dataSource;
Thread.ofPlatform()
.name("DomainTestingQueue::fetch()")
.start(this::fetch);
}
public DomainToTest next() throws InterruptedException {
return queue.take();
}
public void accept(DomainToTest domain, int nodeId) {
try (var conn = dataSource.getConnection();
var flagOkStmt = conn.prepareStatement("""
UPDATE NDP_NEW_DOMAINS
SET STATE='ACCEPTED'
WHERE DOMAIN_ID=?
""");
var assigNodeStmt = conn.prepareStatement("""
UPDATE EC_DOMAIN SET NODE_AFFINITY=?
WHERE ID=?
""")
)
{
conn.setAutoCommit(false);
flagOkStmt.setInt(1, domain.domainId());
flagOkStmt.executeUpdate();
assigNodeStmt.setInt(1, nodeId);
assigNodeStmt.setInt(2, domain.domainId());
assigNodeStmt.executeUpdate();
conn.commit();
} catch (Exception e) {
throw new RuntimeException("Failed to accept domain in database", e);
}
}
public void reject(DomainToTest domain) {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
UPDATE NDP_NEW_DOMAINS
SET STATE='REJECTED', CHECK_COUNT=CHECK_COUNT + 1
WHERE DOMAIN_ID=?
"""))
{
conn.setAutoCommit(false);
stmt.setInt(1, domain.domainId());
stmt.executeUpdate();
conn.commit();
} catch (Exception e) {
throw new RuntimeException("Failed to reject domain in database", e);
}
}
public void fetch() {
while (true) {
List<DomainToTest> domains = new ArrayList<>(2000);
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
SELECT DOMAIN_ID, DOMAIN_NAME
FROM NDP_NEW_DOMAINS
INNER JOIN EC_DOMAIN ON ID=DOMAIN_ID
WHERE NDP_NEW_DOMAINS.STATE = 'NEW'
ORDER BY PRIORITY DESC
LIMIT 2000
"""))
{
var rs = stmt.executeQuery();
while (rs.next()) {
int domainId = rs.getInt("DOMAIN_ID");
String domainName = rs.getString("DOMAIN_NAME");
if (takenDomains.put(domainName, true) != null) {
logger.warn("Domain {} is already processed, skipping", domainName);
continue; // Skip if already taken
}
domains.add(new DomainToTest(domainName, domainId));
}
if (domains.isEmpty()) {
refreshQueue(conn);
}
}
catch (Exception e) {
throw new RuntimeException("Failed to fetch domains from database", e);
}
try {
for (var domain : domains) {
queue.put(domain);
}
}
catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new RuntimeException("Domain fetching interrupted", e);
}
}
}
private void refreshQueue(Connection conn) {
logger.info("Refreshing domain queue in database");
try (var stmt = conn.createStatement()) {
conn.setAutoCommit(false);
logger.info("Revitalizing rejected domains");
// Revitalize rejected domains
stmt.executeUpdate("""
UPDATE NDP_NEW_DOMAINS
SET STATE='NEW'
WHERE NDP_NEW_DOMAINS.STATE = 'REJECTED'
AND DATE_ADD(TS_CHANGE, INTERVAL CHECK_COUNT DAY) > NOW()
""");
conn.commit();
logger.info("Queue refreshed successfully");
} catch (Exception e) {
throw new RuntimeException("Failed to refresh queue in database", e);
}
}
}

View File

@@ -0,0 +1,162 @@
package nu.marginalia.ndp;
import com.google.gson.Gson;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.WmsaHome;
import nu.marginalia.coordination.DomainCoordinationModule;
import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.geoip.GeoIpDictionary;
import nu.marginalia.mq.MessageQueueFactory;
import nu.marginalia.mqapi.ProcessInboxNames;
import nu.marginalia.mqapi.ndp.NdpRequest;
import nu.marginalia.ndp.model.DomainToTest;
import nu.marginalia.process.ProcessConfiguration;
import nu.marginalia.process.ProcessConfigurationModule;
import nu.marginalia.process.ProcessMainClass;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.service.module.ServiceDiscoveryModule;
import nu.marginalia.util.SimpleBlockingThreadPool;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.security.Security;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
public class NdpMain extends ProcessMainClass {
private static final Logger logger = LoggerFactory.getLogger(NdpMain.class);
private final DomainNodeAllocator domainNodeAllocator;
private final DomainTestingQueue domainTestingQueue;
private final ProcessHeartbeat processHeartbeat;
private final DomainEvaluator domainEvaluator;
private final DomainBlacklist domainBlacklist;
private final AtomicInteger domainCount = new AtomicInteger(0);
@Inject
public NdpMain(MessageQueueFactory messageQueueFactory,
ProcessConfiguration config,
DomainNodeAllocator domainNodeAllocator,
DomainTestingQueue domainTestingQueue,
DomainEvaluator domainEvaluator,
DomainBlacklist domainBlacklist,
ProcessHeartbeat processHeartbeat,
Gson gson)
{
super(messageQueueFactory, config, gson, ProcessInboxNames.NDP_INBOX);
this.domainNodeAllocator = domainNodeAllocator;
this.domainEvaluator = domainEvaluator;
this.domainBlacklist = domainBlacklist;
this.domainTestingQueue = domainTestingQueue;
this.processHeartbeat = processHeartbeat;
}
public void run(int goalCount) throws InterruptedException {
logger.info("Wait for blacklist to load...");
domainBlacklist.waitUntilLoaded();
SimpleBlockingThreadPool threadPool = new SimpleBlockingThreadPool(
"NDP-Worker",
8,
10,
SimpleBlockingThreadPool.ThreadType.PLATFORM
);
logger.info("Starting NDP process");
int toInsertCount = goalCount - domainNodeAllocator.totalCount();
if (toInsertCount <= 0) {
logger.info("No new domains to process. Current count: " + domainNodeAllocator.totalCount());
return;
}
try (var hb = processHeartbeat.createAdHocTaskHeartbeat("Growing Index")) {
int cnt;
while ((cnt = domainCount.get()) < toInsertCount) {
if (cnt % 100 == 0) {
hb.progress("Discovery Process", cnt, toInsertCount);
}
var nextDomain = domainTestingQueue.next();
threadPool.submit(() -> evaluateDomain(nextDomain));
}
}
threadPool.shutDown();
// Wait for all tasks to complete or give up after 1 hour
threadPool.awaitTermination(1, TimeUnit.HOURS);
logger.info("NDP process completed. Total domains processed: " + domainCount.get());
}
private void evaluateDomain(DomainToTest nextDomain) {
try {
if (domainEvaluator.evaluateDomain(nextDomain)) {
logger.info("Accepting: {}", nextDomain.domainName());
domainCount.incrementAndGet();
domainTestingQueue.accept(nextDomain, domainNodeAllocator.nextNodeId());
} else {
logger.info("Rejecting: {}", nextDomain.domainName());
domainTestingQueue.reject(nextDomain);
}
}
catch (Exception e) {
domainTestingQueue.reject(nextDomain);
logger.error("Error evaluating domain: " + nextDomain.domainId(), e);
}
}
public static void main(String[] args) throws Exception {
// Prevent Java from caching DNS lookups forever (filling up the system RAM as a result)
Security.setProperty("networkaddress.cache.ttl" , "3600");
// This must run *early*
System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
// If these aren't set properly, the JVM will hang forever on some requests
System.setProperty("sun.net.client.defaultConnectTimeout", "30000");
System.setProperty("sun.net.client.defaultReadTimeout", "30000");
// Set the maximum number of connections to keep alive in the connection pool
System.setProperty("jdk.httpclient.idleTimeout", "15"); // 15 seconds
System.setProperty("jdk.httpclient.connectionPoolSize", "256");
// We don't want to use too much memory caching sessions for https
System.setProperty("javax.net.ssl.sessionCacheSize", "2048");
Injector injector = Guice.createInjector(
new NdpModule(),
new ServiceDiscoveryModule(),
new DomainCoordinationModule(),
new ProcessConfigurationModule("ndp"),
new DatabaseModule(false)
);
GeoIpDictionary geoIpDictionary = injector.getInstance(GeoIpDictionary.class);
geoIpDictionary.waitReady(); // Ensure the GeoIpDictionary is ready before proceeding
NdpMain main = injector.getInstance(NdpMain.class);
var instructions = main.fetchInstructions(NdpRequest.class);
try {
main.run(instructions.value().goal());
instructions.ok();
}
catch (Throwable ex) {
logger.error("Error running ping process", ex);
instructions.err();
}
}
}

View File

@@ -0,0 +1,8 @@
package nu.marginalia.ndp;
import com.google.inject.AbstractModule;
public class NdpModule extends AbstractModule {
public void configure() {
}
}

View File

@@ -0,0 +1,126 @@
package nu.marginalia.ndp.io;
import com.google.inject.Provider;
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.config.ConnectionConfig;
import org.apache.hc.client5.http.config.RequestConfig;
import org.apache.hc.client5.http.cookie.StandardCookieSpec;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager;
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder;
import org.apache.hc.core5.http.HeaderElement;
import org.apache.hc.core5.http.HeaderElements;
import org.apache.hc.core5.http.HttpResponse;
import org.apache.hc.core5.http.io.SocketConfig;
import org.apache.hc.core5.http.message.MessageSupport;
import org.apache.hc.core5.http.protocol.HttpContext;
import org.apache.hc.core5.util.TimeValue;
import org.apache.hc.core5.util.Timeout;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.util.Iterator;
import java.util.concurrent.TimeUnit;
public class HttpClientProvider implements Provider<HttpClient> {
private static final HttpClient client;
private static PoolingHttpClientConnectionManager connectionManager;
private static final Logger logger = LoggerFactory.getLogger(HttpClientProvider.class);
static {
try {
client = createClient();
} catch (Exception e) {
throw new RuntimeException(e);
}
}
public static CloseableHttpClient createClient() throws NoSuchAlgorithmException, KeyManagementException {
final ConnectionConfig connectionConfig = ConnectionConfig.custom()
.setSocketTimeout(15, TimeUnit.SECONDS)
.setConnectTimeout(15, TimeUnit.SECONDS)
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
.build();
connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
.setMaxConnPerRoute(2)
.setMaxConnTotal(50)
.setDefaultConnectionConfig(connectionConfig)
.build();
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
.setSoLinger(TimeValue.ofSeconds(-1))
.setSoTimeout(Timeout.ofSeconds(10))
.build()
);
Thread.ofPlatform().daemon(true).start(() -> {
try {
for (;;) {
TimeUnit.SECONDS.sleep(15);
logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
}
}
catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
});
final RequestConfig defaultRequestConfig = RequestConfig.custom()
.setCookieSpec(StandardCookieSpec.IGNORE)
.setResponseTimeout(10, TimeUnit.SECONDS)
.setConnectionRequestTimeout(5, TimeUnit.MINUTES)
.build();
return HttpClients.custom()
.setConnectionManager(connectionManager)
.setRetryStrategy(new RetryStrategy())
.setKeepAliveStrategy(new ConnectionKeepAliveStrategy() {
// Default keep-alive duration is 3 minutes, but this is too long for us,
// as we are either going to re-use it fairly quickly or close it for a long time.
//
// So we set it to 30 seconds or clamp the server-provided value to a minimum of 10 seconds.
private static final TimeValue defaultValue = TimeValue.ofSeconds(30);
@Override
public TimeValue getKeepAliveDuration(HttpResponse response, HttpContext context) {
final Iterator<HeaderElement> it = MessageSupport.iterate(response, HeaderElements.KEEP_ALIVE);
while (it.hasNext()) {
final HeaderElement he = it.next();
final String param = he.getName();
final String value = he.getValue();
if (value == null)
continue;
if (!"timeout".equalsIgnoreCase(param))
continue;
try {
long timeout = Long.parseLong(value);
timeout = Math.clamp(timeout, 30, defaultValue.toSeconds());
return TimeValue.ofSeconds(timeout);
} catch (final NumberFormatException ignore) {
break;
}
}
return defaultValue;
}
})
.disableRedirectHandling()
.setDefaultRequestConfig(defaultRequestConfig)
.build();
}
@Override
public HttpClient get() {
return client;
}
}

View File

@@ -0,0 +1,79 @@
package nu.marginalia.ndp.io;
import org.apache.hc.client5.http.HttpHostConnectException;
import org.apache.hc.client5.http.HttpRequestRetryStrategy;
import org.apache.hc.core5.http.HttpRequest;
import org.apache.hc.core5.http.HttpResponse;
import org.apache.hc.core5.http.protocol.HttpContext;
import org.apache.hc.core5.util.TimeValue;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.net.ssl.SSLException;
import java.io.IOException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.UnknownHostException;
public class RetryStrategy implements HttpRequestRetryStrategy {
private static final Logger logger = LoggerFactory.getLogger(RetryStrategy.class);
@Override
public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
return switch (exception) {
case SocketTimeoutException ste -> false;
case SSLException ssle -> false;
case UnknownHostException uhe -> false;
case HttpHostConnectException ex -> executionCount < 2;
case SocketException ex -> executionCount < 2;
default -> executionCount <= 3;
};
}
@Override
public boolean retryRequest(HttpResponse response, int executionCount, HttpContext context) {
return switch (response.getCode()) {
case 500, 503 -> executionCount <= 2;
case 429 -> executionCount <= 3;
default -> false;
};
}
@Override
public TimeValue getRetryInterval(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
return TimeValue.ofSeconds(1);
}
@Override
public TimeValue getRetryInterval(HttpResponse response, int executionCount, HttpContext context) {
int statusCode = response.getCode();
// Give 503 a bit more time
if (statusCode == 503) return TimeValue.ofSeconds(5);
if (statusCode == 429) {
// get the Retry-After header
var retryAfterHeader = response.getFirstHeader("Retry-After");
if (retryAfterHeader == null) {
return TimeValue.ofSeconds(3);
}
String retryAfter = retryAfterHeader.getValue();
if (retryAfter == null) {
return TimeValue.ofSeconds(2);
}
try {
int retryAfterTime = Integer.parseInt(retryAfter);
retryAfterTime = Math.clamp(retryAfterTime, 1, 5);
return TimeValue.ofSeconds(retryAfterTime);
} catch (NumberFormatException e) {
logger.warn("Invalid Retry-After header: {}", retryAfter);
}
}
return TimeValue.ofSeconds(2);
}
}

View File

@@ -0,0 +1,4 @@
package nu.marginalia.ndp.model;
public record DomainToTest(String domainName, int domainId) {
}

View File

@@ -0,0 +1,12 @@
The ping process (which has nothing to do with ICMP ping) keeps track of
the aliveness of websites. It also gathers fingerprint information about
the security posture of the website, as well as DNS information.
This is kept to build an idea of when a website is down, and to identify
ownership changes, as well as other significant events in the lifecycle
of a website.
# Central Classes
* [PingMain](java/nu/marginalia/ping/PingMain.java) main class.
* [PingJobScheduler](java/nu/marginalia/ping/PingJobScheduler.java) service that dispatches pings.

View File

@@ -112,7 +112,7 @@ public class HttpClientProvider implements Provider<HttpClient> {
});
final RequestConfig defaultRequestConfig = RequestConfig.custom()
.setCookieSpec(StandardCookieSpec.RELAXED)
.setCookieSpec(StandardCookieSpec.IGNORE)
.setResponseTimeout(10, TimeUnit.SECONDS)
.setConnectionRequestTimeout(5, TimeUnit.MINUTES)
.build();

View File

@@ -11,6 +11,7 @@ import org.slf4j.LoggerFactory;
import javax.net.ssl.SSLException;
import java.io.IOException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.UnknownHostException;
@@ -23,7 +24,8 @@ public class RetryStrategy implements HttpRequestRetryStrategy {
case SocketTimeoutException ste -> false;
case SSLException ssle -> false;
case UnknownHostException uhe -> false;
case HttpHostConnectException ex -> executionCount <= 2; // Only retry once for connection errors
case HttpHostConnectException ex -> executionCount < 2;
case SocketException ex -> executionCount < 2;
default -> executionCount <= 3;
};
}

View File

@@ -1,6 +1,7 @@
package nu.marginalia.ping.model;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;
import javax.annotation.Nullable;
import java.sql.Connection;
@@ -42,7 +43,10 @@ public record DomainSecurityRecord(
@Nullable String headerXXssProtection,
@Nullable String headerServer,
@Nullable String headerXPoweredBy,
@Nullable Instant tsLastUpdate
@Nullable Instant tsLastUpdate,
@Nullable Boolean sslChainValid,
@Nullable Boolean sslHostValid,
@Nullable Boolean sslDateValid
)
implements WritableModel
{
@@ -102,7 +106,11 @@ public record DomainSecurityRecord(
rs.getString("DOMAIN_SECURITY_INFORMATION.HEADER_X_XSS_PROTECTION"),
rs.getString("DOMAIN_SECURITY_INFORMATION.HEADER_SERVER"),
rs.getString("DOMAIN_SECURITY_INFORMATION.HEADER_X_POWERED_BY"),
rs.getObject("DOMAIN_SECURITY_INFORMATION.TS_LAST_UPDATE", Instant.class));
rs.getObject("DOMAIN_SECURITY_INFORMATION.TS_LAST_UPDATE", Instant.class),
rs.getObject("SSL_CHAIN_VALID", Boolean.class),
rs.getObject("SSL_HOST_VALID", Boolean.class),
rs.getObject("SSL_DATE_VALID", Boolean.class)
);
}
private static HttpSchema httpSchemaFromString(@Nullable String schema) {
@@ -149,8 +157,11 @@ public record DomainSecurityRecord(
header_x_powered_by,
ssl_cert_public_key_hash,
asn,
ts_last_update)
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
ts_last_update,
ssl_chain_valid,
ssl_host_valid,
ssl_date_valid)
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
"""))
{
ps.setInt(1, domainId());
@@ -294,6 +305,25 @@ public record DomainSecurityRecord(
} else {
ps.setTimestamp(32, java.sql.Timestamp.from(tsLastUpdate()));
}
if (sslChainValid() == null) {
ps.setNull(33, java.sql.Types.BOOLEAN);
} else {
ps.setBoolean(33, sslChainValid());
}
if (sslHostValid() == null) {
ps.setNull(34, java.sql.Types.BOOLEAN);
} else {
ps.setBoolean(34, sslHostValid());
}
if (sslDateValid() == null) {
ps.setNull(35, java.sql.Types.BOOLEAN);
} else {
ps.setBoolean(35, sslDateValid());
}
ps.executeUpdate();
}
}
@@ -332,6 +362,13 @@ public record DomainSecurityRecord(
private String headerXPoweredBy;
private Instant tsLastUpdate;
private Boolean isCertChainValid;
private Boolean isCertHostValid;
private Boolean isCertDateValid;
private static Instant MAX_UNIX_TIMESTAMP = Instant.ofEpochSecond(Integer.MAX_VALUE);
public Builder() {
// Default values for boolean fields
this.sslCertWildcard = false;
@@ -374,12 +411,18 @@ public record DomainSecurityRecord(
return this;
}
public Builder sslCertNotBefore(Instant sslCertNotBefore) {
public Builder sslCertNotBefore(@NotNull Instant sslCertNotBefore) {
if (sslCertNotBefore.isAfter(MAX_UNIX_TIMESTAMP)) {
sslCertNotBefore = MAX_UNIX_TIMESTAMP;
}
this.sslCertNotBefore = sslCertNotBefore;
return this;
}
public Builder sslCertNotAfter(Instant sslCertNotAfter) {
public Builder sslCertNotAfter(@NotNull Instant sslCertNotAfter) {
if (sslCertNotAfter.isAfter(MAX_UNIX_TIMESTAMP)) {
sslCertNotAfter = MAX_UNIX_TIMESTAMP;
}
this.sslCertNotAfter = sslCertNotAfter;
return this;
}
@@ -499,6 +542,21 @@ public record DomainSecurityRecord(
return this;
}
public Builder sslChainValid(@Nullable Boolean isCertChainValid) {
this.isCertChainValid = isCertChainValid;
return this;
}
public Builder sslHostValid(@Nullable Boolean isCertHostValid) {
this.isCertHostValid = isCertHostValid;
return this;
}
public Builder sslDateValid(@Nullable Boolean isCertDateValid) {
this.isCertDateValid = isCertDateValid;
return this;
}
public DomainSecurityRecord build() {
return new DomainSecurityRecord(
domainId,
@@ -532,7 +590,10 @@ public record DomainSecurityRecord(
headerXXssProtection,
headerServer,
headerXPoweredBy,
tsLastUpdate
tsLastUpdate,
isCertChainValid,
isCertHostValid,
isCertDateValid
);
}
}

View File

@@ -0,0 +1,59 @@
package nu.marginalia.ping.ssl;
import org.bouncycastle.asn1.ASN1OctetString;
import org.bouncycastle.asn1.ASN1Primitive;
import org.bouncycastle.asn1.x509.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.List;
public class AIAExtractor {
private static final Logger logger = LoggerFactory.getLogger(AIAExtractor.class);
public static List<String> getCaIssuerUrls(X509Certificate certificate) {
List<String> caIssuerUrls = new ArrayList<>();
try {
// Get the AIA extension value
byte[] aiaExtensionValue = certificate.getExtensionValue(Extension.authorityInfoAccess.getId());
if (aiaExtensionValue == null) {
logger.warn("No AIA extension found");
return caIssuerUrls;
}
// Parse the extension - first unwrap the OCTET STRING
ASN1OctetString octetString = ASN1OctetString.getInstance(aiaExtensionValue);
ASN1Primitive aiaObj = ASN1Primitive.fromByteArray(octetString.getOctets());
// Parse as AuthorityInformationAccess
AuthorityInformationAccess aia = AuthorityInformationAccess.getInstance(aiaObj);
if (aia != null) {
AccessDescription[] accessDescriptions = aia.getAccessDescriptions();
for (AccessDescription accessDesc : accessDescriptions) {
// Check if this is a CA Issuers access method
if (X509ObjectIdentifiers.id_ad_caIssuers.equals(accessDesc.getAccessMethod())) {
GeneralName accessLocation = accessDesc.getAccessLocation();
// Check if it's a URI
if (accessLocation.getTagNo() == GeneralName.uniformResourceIdentifier) {
String url = accessLocation.getName().toString();
caIssuerUrls.add(url);
}
}
}
}
} catch (Exception e) {
logger.error("Error parsing AIA extension: {}", e.getMessage());
}
return caIssuerUrls;
}
}

View File

@@ -0,0 +1,273 @@
package nu.marginalia.ping.ssl;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import nu.marginalia.WmsaHome;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.impl.classic.HttpClientBuilder;
import org.apache.hc.core5.http.ClassicHttpRequest;
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
import org.bouncycastle.asn1.ASN1OctetString;
import org.bouncycastle.asn1.ASN1Primitive;
import org.bouncycastle.asn1.x509.*;
import org.bouncycastle.cert.X509CertificateHolder;
import org.bouncycastle.cert.jcajce.JcaX509CertificateConverter;
import org.bouncycastle.cms.CMSSignedData;
import org.bouncycastle.openssl.PEMParser;
import org.bouncycastle.util.Store;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.ByteArrayInputStream;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.security.cert.CertificateFactory;
import java.security.cert.TrustAnchor;
import java.security.cert.X509Certificate;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
public class CertificateFetcher {
private static final Logger logger = LoggerFactory.getLogger(CertificateFetcher.class);
private static HttpClient client = HttpClientBuilder.create()
.build();
private static Cache<String, X509Certificate> cache = CacheBuilder
.newBuilder()
.expireAfterAccess(Duration.ofHours(6))
.maximumSize(10_000)
.build();
public static List<X509Certificate> fetchMissingIntermediates(X509Certificate leafCert) {
List<X509Certificate> intermediates = new ArrayList<>();
// Get CA Issuer URLs from AIA extension
List<String> caIssuerUrls = AIAExtractor.getCaIssuerUrls(leafCert);
for (String url : caIssuerUrls) {
try {
// Check cache first
X509Certificate cached = cache.getIfPresent(url);
if (cached != null) {
intermediates.add(cached);
continue;
}
// Download certificate
X509Certificate downloaded = downloadCertificate(url);
if (downloaded != null) {
// Verify this certificate can actually sign the leaf
if (canSign(downloaded, leafCert)) {
intermediates.add(downloaded);
cache.put(url, downloaded);
logger.info("Downloaded certificate for url: {}", url);
} else {
logger.warn("Downloaded certificate cannot sign leaf cert from: {}", url);
}
}
} catch (Exception e) {
logger.error("Failed to fetch certificate from {}: {}", url, e.getMessage());
}
}
return intermediates;
}
private static X509Certificate downloadCertificate(String urlString) {
try {
ClassicHttpRequest request = ClassicRequestBuilder.create("GET")
.addHeader("User-Agent", WmsaHome.getUserAgent() + " (Certificate Fetcher)")
.setUri(urlString)
.build();
byte[] data = client.execute(request, rsp -> {
var entity = rsp.getEntity();
if (entity == null) {
logger.warn("GET request returned no content for {}", urlString);
return null;
}
return entity.getContent().readAllBytes();
});
if (data.length == 0) {
logger.warn("Empty response from {}", urlString);
return null;
}
// Try different formats based on file extension
if (urlString.toLowerCase().endsWith(".p7c") || urlString.toLowerCase().endsWith(".p7b")) {
return parsePKCS7(data);
} else {
return parseX509(data);
}
} catch (Exception e) {
logger.warn("Failed to fetch certificate from {}: {}", urlString, e.getMessage());
return null;
}
}
private static List<X509Certificate> parseMultiplePEM(byte[] data) throws Exception {
List<X509Certificate> certificates = new ArrayList<>();
try (StringReader stringReader = new StringReader(new String(data, StandardCharsets.UTF_8));
PEMParser pemParser = new PEMParser(stringReader)) {
JcaX509CertificateConverter converter = new JcaX509CertificateConverter();
Object object;
while ((object = pemParser.readObject()) != null) {
if (object instanceof X509CertificateHolder) {
X509CertificateHolder certHolder = (X509CertificateHolder) object;
certificates.add(converter.getCertificate(certHolder));
} else if (object instanceof X509Certificate) {
certificates.add((X509Certificate) object);
}
}
}
return certificates;
}
private static X509Certificate parseX509(byte[] data) throws Exception {
CertificateFactory cf = CertificateFactory.getInstance("X.509");
return (X509Certificate) cf.generateCertificate(new ByteArrayInputStream(data));
}
private static X509Certificate parsePKCS7(byte[] data) throws Exception {
try {
// Parse PKCS#7/CMS structure
CMSSignedData cmsData = new CMSSignedData(data);
Store<X509CertificateHolder> certStore = cmsData.getCertificates();
JcaX509CertificateConverter converter = new JcaX509CertificateConverter();
// Get the first certificate from the store
for (X509CertificateHolder certHolder : certStore.getMatches(null)) {
X509Certificate cert = converter.getCertificate(certHolder);
return cert;
}
logger.warn("No certificates found in PKCS#7 structure");
return null;
} catch (Exception e) {
logger.error("Failed to parse PKCS#7 structure from {}: {}", data.length, e.getMessage());
return parseX509(data);
}
}
private static boolean canSign(X509Certificate issuerCert, X509Certificate subjectCert) {
try {
// Check if the issuer DN matches
if (!issuerCert.getSubjectDN().equals(subjectCert.getIssuerDN())) {
return false;
}
// Try to verify the signature
subjectCert.verify(issuerCert.getPublicKey());
return true;
} catch (Exception e) {
return false;
}
}
// Recursive fetching for complete chains
public static List<X509Certificate> buildCompleteChain(X509Certificate leafCert) {
List<X509Certificate> completeChain = new ArrayList<>();
completeChain.add(leafCert);
X509Certificate currentCert = leafCert;
int maxDepth = 10; // Prevent infinite loops
while (maxDepth-- > 0) {
// If current cert is self-signed (root), we're done
if (currentCert.getSubjectDN().equals(currentCert.getIssuerDN())) {
break;
}
// Try to find the issuer
List<X509Certificate> intermediates = fetchMissingIntermediates(currentCert);
if (intermediates.isEmpty()) {
logger.error("Could not find issuer for: {}", currentCert.getSubjectDN());
break;
}
// Add the first valid intermediate
X509Certificate intermediate = intermediates.get(0);
completeChain.add(intermediate);
currentCert = intermediate;
}
return completeChain;
}
// Add this to your AIAExtractor class if not already present
public static List<String> getOCSPUrls(X509Certificate certificate) {
List<String> ocspUrls = new ArrayList<>();
try {
byte[] aiaExtensionValue = certificate.getExtensionValue(Extension.authorityInfoAccess.getId());
if (aiaExtensionValue == null) {
return ocspUrls;
}
ASN1OctetString octetString = ASN1OctetString.getInstance(aiaExtensionValue);
ASN1Primitive aiaObj = ASN1Primitive.fromByteArray(octetString.getOctets());
AuthorityInformationAccess aia = AuthorityInformationAccess.getInstance(aiaObj);
if (aia != null) {
AccessDescription[] accessDescriptions = aia.getAccessDescriptions();
for (AccessDescription accessDesc : accessDescriptions) {
if (X509ObjectIdentifiers.id_ad_ocsp.equals(accessDesc.getAccessMethod())) {
GeneralName accessLocation = accessDesc.getAccessLocation();
if (accessLocation.getTagNo() == GeneralName.uniformResourceIdentifier) {
String url = accessLocation.getName().toString();
ocspUrls.add(url);
}
}
}
}
} catch (Exception e) {
logger.error("Error parsing AIA extension for OCSP: {}", e.getMessage());
}
return ocspUrls;
}
public static Set<TrustAnchor> getRootCerts(String bundleUrl) throws Exception {
ClassicHttpRequest request = ClassicRequestBuilder.create("GET")
.addHeader("User-Agent", WmsaHome.getUserAgent() + " (Certificate Fetcher)")
.setUri(bundleUrl)
.build();
byte[] data = client.execute(request, rsp -> {
var entity = rsp.getEntity();
if (entity == null) {
logger.warn("GET request returned no content for {}", bundleUrl);
return null;
}
return entity.getContent().readAllBytes();
});
List<TrustAnchor> anchors = new ArrayList<>();
for (var cert : parseMultiplePEM(data)) {
try {
anchors.add(new TrustAnchor(cert, null));
} catch (Exception e) {
logger.warn("Failed to create TrustAnchor for certificate: {}", e.getMessage());
}
}
logger.info("Loaded {} root certificates from {}", anchors.size(), bundleUrl);
return Set.copyOf(anchors);
}
}

View File

@@ -0,0 +1,493 @@
package nu.marginalia.ping.ssl;
import org.bouncycastle.asn1.ASN1OctetString;
import org.bouncycastle.asn1.ASN1Primitive;
import org.bouncycastle.asn1.x509.*;
import javax.security.auth.x500.X500Principal;
import java.security.cert.TrustAnchor;
import java.security.cert.X509Certificate;
import java.util.*;
/** Utility class for validating X.509 certificates.
* This class provides methods to validate certificate chains, check expiration,
* hostname validity, and revocation status.
* <p></p>
* This is extremely unsuitable for actual SSL/TLS validation,
* and is only to be used in analyzing certificates for fingerprinting
* and diagnosing servers!
*/
public class CertificateValidator {
// If true, will attempt to fetch missing intermediate certificates via AIA urls.
private static final boolean TRY_FETCH_MISSING_CERTS = false;
public static class ValidationResult {
public boolean chainValid = false;
public boolean certificateExpired = false;
public boolean certificateRevoked = false;
public boolean selfSigned = false;
public boolean hostnameValid = false;
public boolean isValid() {
return !selfSigned && !certificateExpired && !certificateRevoked && hostnameValid;
}
public List<String> errors = new ArrayList<>();
public List<String> warnings = new ArrayList<>();
public Map<String, Object> details = new HashMap<>();
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("=== Certificate Validation Result ===\n");
sb.append("Chain Valid: ").append(chainValid ? "" : "").append("\n");
sb.append("Not Expired: ").append(!certificateExpired ? "" : "").append("\n");
sb.append("Not Revoked: ").append(!certificateRevoked ? "" : "").append("\n");
sb.append("Hostname Valid: ").append(hostnameValid ? "" : "").append("\n");
sb.append("Self-Signed: ").append(selfSigned ? "" : "").append("\n");
if (!errors.isEmpty()) {
sb.append("\nErrors:\n");
for (String error : errors) {
sb.append("").append(error).append("\n");
}
}
if (!warnings.isEmpty()) {
sb.append("\nWarnings:\n");
for (String warning : warnings) {
sb.append("").append(warning).append("\n");
}
}
if (!details.isEmpty()) {
sb.append("\nDetails:\n");
for (Map.Entry<String, Object> entry : details.entrySet()) {
sb.append(" ").append(entry.getKey()).append(": ").append(entry.getValue()).append("\n");
}
}
return sb.toString();
}
}
public static ValidationResult validateCertificate(X509Certificate[] certChain,
String hostname) {
return validateCertificate(certChain, hostname, false);
}
public static ValidationResult validateCertificate(X509Certificate[] certChain,
String hostname,
boolean autoTrustFetchedRoots) {
ValidationResult result = new ValidationResult();
if (certChain == null || certChain.length == 0) {
result.errors.add("No certificates provided");
return result;
}
X509Certificate leafCert = certChain[0];
// 1. Check certificate expiration
result.certificateExpired = checkExpiration(leafCert, result);
// 2. Check hostname validity
result.hostnameValid = checkHostname(leafCert, hostname, result);
// 3. Not really checking if it's self-signed, but if the chain is incomplete (and likely self-signed)
result.selfSigned = certChain.length <= 1;
// 4. Check certificate chain validity (optionally with AIA fetching)
result.chainValid = checkChainValidity(certChain, RootCerts.getTrustAnchors(), result, autoTrustFetchedRoots);
// 5. Check revocation status
result.certificateRevoked = false; // not implemented
// checkRevocation(leafCert, result);
return result;
}
private static boolean checkExpiration(X509Certificate cert, ValidationResult result) {
try {
cert.checkValidity();
result.details.put("validFrom", cert.getNotBefore());
result.details.put("validTo", cert.getNotAfter());
// Warn if expires soon (30 days)
long daysUntilExpiry = (cert.getNotAfter().getTime() - System.currentTimeMillis()) / (1000 * 60 * 60 * 24);
if (daysUntilExpiry < 30) {
result.warnings.add("Certificate expires in " + daysUntilExpiry + " days");
}
return false; // Not expired
} catch (Exception e) {
result.errors.add("Certificate expired or not yet valid: " + e.getMessage());
return true; // Expired
}
}
private static boolean checkHostname(X509Certificate cert, String hostname, ValidationResult result) {
if (hostname == null || hostname.isEmpty()) {
result.warnings.add("No hostname provided for validation");
return false;
}
try {
// Check Subject CN
String subjectCN = getCommonName(cert.getSubjectX500Principal());
if (subjectCN != null && matchesHostname(subjectCN, hostname)) {
result.details.put("hostnameMatchedBy", "Subject CN: " + subjectCN);
return true;
}
// Check Subject Alternative Names
Collection<List<?>> subjectAltNames = cert.getSubjectAlternativeNames();
if (subjectAltNames != null) {
for (List<?> altName : subjectAltNames) {
if (altName.size() >= 2) {
Integer type = (Integer) altName.get(0);
if (type == 2) { // DNS name
String dnsName = (String) altName.get(1);
if (matchesHostname(dnsName, hostname)) {
result.details.put("hostnameMatchedBy", "SAN DNS: " + dnsName);
return true;
}
}
}
}
}
result.errors.add("Hostname '" + hostname + "' does not match certificate");
result.details.put("subjectCN", subjectCN);
result.details.put("subjectAltNames", subjectAltNames);
return false;
} catch (Exception e) {
result.errors.add("Error checking hostname: " + e.getMessage());
return false;
}
}
private static boolean checkChainValidity(X509Certificate[] originalChain,
Set<TrustAnchor> trustAnchors,
ValidationResult result,
boolean autoTrustFetchedRoots) {
try {
// First try with the original chain
ChainValidationResult originalResult = validateChain(originalChain, trustAnchors);
if (originalResult.isValid) {
result.details.put("chainLength", originalChain.length);
result.details.put("chainExtended", false);
return true;
}
else if (!TRY_FETCH_MISSING_CERTS) {
result.errors.addAll(originalResult.issues);
result.details.put("chainLength", originalChain.length);
result.details.put("chainExtended", false);
return false;
}
try {
List<X509Certificate> repairedChain = CertificateFetcher.buildCompleteChain(originalChain[0]);
if (!repairedChain.isEmpty()) {
X509Certificate[] extendedArray = repairedChain.toArray(new X509Certificate[0]);
// Create a copy of trust anchors for potential modification
Set<TrustAnchor> workingTrustAnchors = new HashSet<>(trustAnchors);
// If auto-trust is enabled, add any self-signed certs as trusted roots
if (autoTrustFetchedRoots) {
for (X509Certificate cert : extendedArray) {
if (cert.getSubjectX500Principal().equals(cert.getIssuerX500Principal())) {
// Self-signed certificate - add to trust anchors if not already there
boolean alreadyTrusted = false;
for (TrustAnchor anchor : workingTrustAnchors) {
if (anchor.getTrustedCert().equals(cert)) {
alreadyTrusted = true;
break;
}
}
if (!alreadyTrusted) {
workingTrustAnchors.add(new TrustAnchor(cert, null));
result.warnings.add("Auto-trusted fetched root: " + cert.getSubjectX500Principal().getName());
}
}
}
}
ChainValidationResult extendedResult = validateChain(extendedArray, workingTrustAnchors);
result.details.put("chainLength", extendedArray.length);
result.details.put("originalChainLength", originalChain.length);
result.details.put("chainExtended", true);
result.details.put("fetchedIntermediates", extendedArray.length);
result.details.put("autoTrustedRoots", autoTrustFetchedRoots);
if (extendedResult.isValid) {
result.warnings.add("Extended certificate chain with " + extendedArray.length + " fetched intermediates");
return true;
} else {
result.errors.addAll(extendedResult.issues);
return false;
}
} else {
result.warnings.add("Could not fetch missing intermediate certificates");
result.details.put("chainLength", originalChain.length);
result.details.put("chainExtended", false);
result.errors.addAll(originalResult.issues);
return false;
}
} catch (Exception e) {
result.warnings.add("Failed to fetch intermediates: " + e.getMessage());
result.details.put("chainLength", originalChain.length);
result.details.put("chainExtended", false);
result.errors.addAll(originalResult.issues);
return false;
}
} catch (Exception e) {
result.errors.add("Error validating chain: " + e.getMessage());
return false;
}
}
private static void debugCertificateChain(List<X509Certificate> certs, Set<TrustAnchor> trustAnchors) {
System.out.println("=== Certificate Chain Analysis ===");
int length = certs.size();
System.out.println("Chain length: " + length);
int i = 0;
for (var x509cert : certs) {
System.out.println("\nCertificate " + i++ + ":");
System.out.println(" Subject: " + x509cert.getSubjectDN().getName());
System.out.println(" Issuer: " + x509cert.getIssuerDN().getName());
System.out.println(" Serial: " + x509cert.getSerialNumber().toString(16));
System.out.println(" Valid: " + x509cert.getNotBefore() + " to " + x509cert.getNotAfter());
System.out.println(" Self-signed: " + x509cert.getSubjectDN().equals(x509cert.getIssuerDN()));
// Check if we have the issuer in our trust anchors
boolean issuerFound = false;
for (TrustAnchor anchor : trustAnchors) {
if (anchor.getTrustedCert().getSubjectDN().equals(x509cert.getIssuerDN())) {
issuerFound = true;
System.out.println(" Issuer found in trust anchors: " + anchor.getTrustedCert().getSubjectDN().getName());
break;
}
}
if (!issuerFound && i == length) {
System.out.println(" *** MISSING ISSUER: " + x509cert.getIssuerDN().getName());
}
}
}
private static class ChainValidationResult {
boolean isValid = false;
List<String> issues = new ArrayList<>();
}
private static ChainValidationResult validateChain(X509Certificate[] certChain, Set<TrustAnchor> trustAnchors) {
ChainValidationResult result = new ChainValidationResult();
// Check each certificate in the chain
for (int i = 0; i < certChain.length; i++) {
X509Certificate cert = certChain[i];
// Check certificate validity dates
try {
cert.checkValidity();
} catch (Exception e) {
result.issues.add("Certificate " + i + " expired: " + cert.getSubjectDN());
}
// Check signature (except for self-signed root)
if (i < certChain.length - 1) {
X509Certificate issuer = certChain[i + 1];
try {
cert.verify(issuer.getPublicKey());
} catch (Exception e) {
result.issues.add("Certificate " + i + " signature invalid: " + e.getMessage());
}
// Check issuer/subject relationship
if (!cert.getIssuerX500Principal().equals(issuer.getSubjectX500Principal())) {
result.issues.add("Certificate " + i + " issuer does not match certificate " + (i + 1) + " subject");
}
}
}
// Check if chain ends with a trusted root
X509Certificate rootCert = certChain[certChain.length - 1];
boolean trustedRootFound = false;
if (rootCert.getSubjectX500Principal().equals(rootCert.getIssuerX500Principal())) {
// Self-signed root - check if it's in trust anchors
for (TrustAnchor anchor : trustAnchors) {
if (anchor.getTrustedCert().equals(rootCert)) {
trustedRootFound = true;
break;
}
}
if (!trustedRootFound) {
// Check if we trust the root's subject even if the certificate is different
for (TrustAnchor anchor : trustAnchors) {
if (anchor.getTrustedCert().getSubjectX500Principal().equals(rootCert.getSubjectX500Principal())) {
trustedRootFound = true;
// Note: we'll add this as a warning in the main result
break;
}
}
}
} else {
// Chain doesn't end with self-signed cert - check if issuer is trusted
for (TrustAnchor anchor : trustAnchors) {
if (anchor.getTrustedCert().getSubjectX500Principal().equals(rootCert.getIssuerX500Principal())) {
trustedRootFound = true;
break;
}
}
}
if (!trustedRootFound) {
result.issues.add("Chain does not end with a trusted root");
}
result.isValid = result.issues.isEmpty();
return result;
}
private static boolean checkRevocation(X509Certificate cert, ValidationResult result) {
try {
// Try OCSP first
if (checkOCSP(cert, result)) {
return true; // Revoked
}
// Fallback to CRL
if (checkCRL(cert, result)) {
return true; // Revoked
}
result.warnings.add("Could not check revocation status");
return false; // Assume not revoked if we can't check
} catch (Exception e) {
result.warnings.add("Error checking revocation: " + e.getMessage());
return false;
}
}
private static boolean checkOCSP(X509Certificate cert, ValidationResult result) {
// For now, just extract OCSP URL and note that we found it
try {
List<String> ocspUrls = CertificateFetcher.getOCSPUrls(cert);
if (!ocspUrls.isEmpty()) {
result.details.put("ocspUrls", ocspUrls);
result.warnings.add("OCSP checking not implemented - found OCSP URLs: " + ocspUrls);
}
return false;
} catch (Exception e) {
return false;
}
}
private static boolean checkCRL(X509Certificate cert, ValidationResult result) {
// Basic CRL URL extraction
try {
List<String> crlUrls = getCRLUrls(cert);
if (!crlUrls.isEmpty()) {
result.details.put("crlUrls", crlUrls);
result.warnings.add("CRL checking not implemented - found CRL URLs: " + crlUrls);
}
return false;
} catch (Exception e) {
return false;
}
}
// Helper methods
private static String getCommonName(X500Principal principal) {
String name = principal.getName();
String[] parts = name.split(",");
for (String part : parts) {
part = part.trim();
if (part.startsWith("CN=")) {
return part.substring(3);
}
}
return null;
}
private static boolean matchesHostname(String certName, String hostname) {
if (certName == null || hostname == null) {
return false;
}
// Exact match
if (certName.equalsIgnoreCase(hostname)) {
return true;
}
// Wildcard match
if (certName.startsWith("*.")) {
String certDomain = certName.substring(2);
String hostDomain = hostname;
int firstDot = hostname.indexOf('.');
if (firstDot > 0) {
hostDomain = hostname.substring(firstDot + 1);
}
return certDomain.equalsIgnoreCase(hostDomain);
}
return false;
}
private static List<String> getCRLUrls(X509Certificate cert) {
// This would need to parse the CRL Distribution Points extension
// For now, return empty list
return new ArrayList<>();
}
// Add this to your AIAExtractor class if not already present
public static List<String> getOCSPUrls(X509Certificate certificate) {
List<String> ocspUrls = new ArrayList<>();
try {
byte[] aiaExtensionValue = certificate.getExtensionValue(Extension.authorityInfoAccess.getId());
if (aiaExtensionValue == null) {
return ocspUrls;
}
ASN1OctetString octetString = ASN1OctetString.getInstance(aiaExtensionValue);
ASN1Primitive aiaObj = ASN1Primitive.fromByteArray(octetString.getOctets());
AuthorityInformationAccess aia = AuthorityInformationAccess.getInstance(aiaObj);
if (aia != null) {
AccessDescription[] accessDescriptions = aia.getAccessDescriptions();
for (AccessDescription accessDesc : accessDescriptions) {
if (X509ObjectIdentifiers.id_ad_ocsp.equals(accessDesc.getAccessMethod())) {
GeneralName accessLocation = accessDesc.getAccessLocation();
if (accessLocation.getTagNo() == GeneralName.uniformResourceIdentifier) {
String url = accessLocation.getName().toString();
ocspUrls.add(url);
}
}
}
}
} catch (Exception e) {
System.err.println("Error parsing AIA extension for OCSP: " + e.getMessage());
}
return ocspUrls;
}
}

View File

@@ -1,491 +0,0 @@
package nu.marginalia.ping.ssl;
import javax.net.ssl.*;
import java.io.FileInputStream;
import java.security.InvalidAlgorithmParameterException;
import java.security.KeyStore;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.*;
import java.util.*;
/**
* Custom PKIX validator for validating X.509 certificate chains with verbose output
* for db export (i.e. not just SSLException).
*/
public class CustomPKIXValidator {
private final Set<TrustAnchor> trustAnchors;
private final boolean revocationEnabled;
private final boolean anyPolicyInhibited;
private final boolean explicitPolicyRequired;
private final boolean policyMappingInhibited;
private final Set<String> initialPolicies;
private static final Set<String> EV_POLICY_OIDS = Set.of(
"1.3.6.1.4.1.17326.10.14.2.1.2", // Entrust
"1.3.6.1.4.1.17326.10.8.12.1.2", // Entrust
"2.16.840.1.114028.10.1.2", // Entrust/AffirmTrust
"1.3.6.1.4.1.6449.1.2.1.5.1", // Comodo
"1.3.6.1.4.1.8024.0.2.100.1.2", // QuoVadis
"2.16.840.1.114404.1.1.2.4.1", // GoDaddy
"2.16.840.1.114413.1.7.23.3", // DigiCert
"2.16.840.1.114414.1.7.23.3", // DigiCert
"1.3.6.1.4.1.14370.1.6", // GlobalSign
"2.16.756.1.89.1.2.1.1", // SwissSign
"1.3.6.1.4.1.4146.1.1" // GlobalSign
);
// Constructor with default settings
public CustomPKIXValidator() throws Exception {
this(true, false, false, false, null);
}
// Constructor with custom settings
public CustomPKIXValidator(boolean revocationEnabled,
boolean anyPolicyInhibited,
boolean explicitPolicyRequired,
boolean policyMappingInhibited,
Set<String> initialPolicies) throws Exception {
this.trustAnchors = loadDefaultTrustAnchors();
this.revocationEnabled = revocationEnabled;
this.anyPolicyInhibited = anyPolicyInhibited;
this.explicitPolicyRequired = explicitPolicyRequired;
this.policyMappingInhibited = policyMappingInhibited;
this.initialPolicies = initialPolicies;
}
// Constructor with custom trust anchors
public CustomPKIXValidator(Set<TrustAnchor> customTrustAnchors,
boolean revocationEnabled) {
this.trustAnchors = new HashSet<>(customTrustAnchors);
this.revocationEnabled = revocationEnabled;
this.anyPolicyInhibited = false;
this.explicitPolicyRequired = false;
this.policyMappingInhibited = false;
this.initialPolicies = null;
}
/**
* Validates certificate chain using PKIX algorithm
*/
public PKIXValidationResult validateCertificateChain(String hostname, X509Certificate[] certChain) {
EnumSet<PkixValidationError> errors = EnumSet.noneOf(PkixValidationError.class);
try {
// 1. Basic input validation
if (certChain == null || certChain.length == 0) {
return new PKIXValidationResult(false, "Certificate chain is empty", errors,
null, null, null, false);
}
if (hostname == null || hostname.trim().isEmpty()) {
return new PKIXValidationResult(false, "Hostname is null or empty", errors,
null, null, null, false);
}
// 2. Create certificate path
CertPath certPath = createCertificatePath(certChain);
if (certPath == null) {
return new PKIXValidationResult(false, "Failed to create certificate path", errors,
null, null, null, false);
}
// 3. Build and validate certificate path using PKIX
PKIXCertPathValidatorResult pkixResult = performPKIXValidation(certPath, errors);
// 4. Validate hostname
boolean hostnameValid = validateHostname(hostname, certChain[0], errors);
// 5. Extract critical extensions information
Set<String> criticalExtensions = extractCriticalExtensions(certChain);
boolean overallValid = (pkixResult != null) && hostnameValid;
String errorMessage = null;
if (pkixResult == null) {
errorMessage = "PKIX path validation failed";
} else if (!hostnameValid) {
errorMessage = "Hostname validation failed";
}
return new PKIXValidationResult(overallValid, errorMessage, errors,
pkixResult, certPath, criticalExtensions, hostnameValid);
} catch (Exception e) {
return new PKIXValidationResult(false, "Validation exception: " + e.getMessage(),
errors, null, null, null, false);
}
}
/**
* Creates a certificate path from the certificate chain
*/
private CertPath createCertificatePath(X509Certificate[] certChain) throws CertificateException {
CertificateFactory cf = CertificateFactory.getInstance("X.509");
List<Certificate> certList = Arrays.asList(certChain);
return cf.generateCertPath(certList);
}
/**
* Performs PKIX validation
*/
private PKIXCertPathValidatorResult performPKIXValidation(CertPath certPath, Set<PkixValidationError> warnings) {
try {
// Create PKIX parameters
PKIXParameters params = new PKIXParameters(trustAnchors);
// Configure PKIX parameters
params.setRevocationEnabled(revocationEnabled);
params.setAnyPolicyInhibited(anyPolicyInhibited);
params.setExplicitPolicyRequired(explicitPolicyRequired);
params.setPolicyMappingInhibited(policyMappingInhibited);
if (initialPolicies != null && !initialPolicies.isEmpty()) {
params.setInitialPolicies(initialPolicies);
}
// Set up certificate stores for intermediate certificates if needed
// This helps with path building when intermediate certs are missing
List<Certificate> intermediateCerts = extractIntermediateCertificates(certPath);
if (!intermediateCerts.isEmpty()) {
CertStore certStore = CertStore.getInstance("Collection",
new CollectionCertStoreParameters(intermediateCerts));
params.addCertStore(certStore);
}
// Configure revocation checking if enabled
if (revocationEnabled) {
configureRevocationChecking(params);
}
// Create and run validator
CertPathValidator validator = CertPathValidator.getInstance("PKIX");
PKIXCertPathValidatorResult result = (PKIXCertPathValidatorResult)
validator.validate(certPath, params);
return result;
} catch (CertPathValidatorException e) {
warnings.add(PkixValidationError.PATH_VALIDATION_FAILED);
return null;
} catch (InvalidAlgorithmParameterException e) {
warnings.add(PkixValidationError.INVALID_PKIX_PARAMETERS);
return null;
} catch (Exception e) {
warnings.add(PkixValidationError.UNKNOWN);
return null;
}
}
/**
* Extracts intermediate certificates from the path
*/
private List<Certificate> extractIntermediateCertificates(CertPath certPath) {
List<Certificate> certs = (List<Certificate>) certPath.getCertificates();
if (certs.size() <= 2) {
return new ArrayList<>(); // Only leaf and root, no intermediates
}
// Return all but the first (leaf) and potentially last (root)
return new ArrayList<>(certs.subList(1, certs.size()));
}
/**
* Configures revocation checking (CRL/OCSP)
*/
private void configureRevocationChecking(PKIXParameters params) throws NoSuchAlgorithmException {
// Create PKIX revocation checker
PKIXRevocationChecker revocationChecker = (PKIXRevocationChecker)
CertPathValidator.getInstance("PKIX").getRevocationChecker();
// Configure revocation checker options
Set<PKIXRevocationChecker.Option> options = EnumSet.of(
PKIXRevocationChecker.Option.PREFER_CRLS,
PKIXRevocationChecker.Option.SOFT_FAIL // Don't fail if revocation info unavailable
);
revocationChecker.setOptions(options);
params.addCertPathChecker(revocationChecker);
}
/**
* Comprehensive hostname validation including SAN and CN
*/
private boolean validateHostname(String hostname, X509Certificate cert, Set<PkixValidationError> warnings) {
try {
// Use Java's built-in hostname verifier as a starting point
HostnameVerifier defaultVerifier = HttpsURLConnection.getDefaultHostnameVerifier();
// Create a mock SSL session for the hostname verifier
MockSSLSession mockSession = new MockSSLSession(cert);
boolean defaultResult = defaultVerifier.verify(hostname, mockSession);
if (defaultResult) {
return true;
}
// If default fails, do manual validation
return performManualHostnameValidation(hostname, cert, warnings);
} catch (Exception e) {
warnings.add(PkixValidationError.UNSPECIFIED_HOST_ERROR);
return false;
}
}
/**
* Manual hostname validation implementation
*/
private boolean performManualHostnameValidation(String hostname, X509Certificate cert, Set<PkixValidationError> warnings) {
try {
// 1. Check Subject Alternative Names (SAN) - preferred method
Collection<List<?>> sanEntries = cert.getSubjectAlternativeNames();
if (sanEntries != null) {
for (List<?> sanEntry : sanEntries) {
if (sanEntry.size() >= 2) {
Integer type = (Integer) sanEntry.get(0);
if (type == 2) { // DNS name
String dnsName = (String) sanEntry.get(1);
if (matchesHostname(hostname, dnsName)) {
return true;
}
} else if (type == 7) { // IP address
String ipAddress = (String) sanEntry.get(1);
if (hostname.equals(ipAddress)) {
return true;
}
}
}
}
// If SAN is present but no match found, don't check CN (RFC 6125)
warnings.add(PkixValidationError.SAN_MISMATCH);
return false;
}
// 2. Fallback to Common Name (CN) in subject if no SAN present
String subjectDN = cert.getSubjectDN().getName();
String cn = extractCommonName(subjectDN);
if (cn != null) {
if (matchesHostname(hostname, cn)) {
return true;
}
}
warnings.add(PkixValidationError.SAN_MISMATCH);
return false;
} catch (Exception e) {
warnings.add(PkixValidationError.UNKNOWN);
return false;
}
}
/**
* Checks if hostname matches certificate name (handles wildcards)
*/
private boolean matchesHostname(String hostname, String certName) {
if (hostname == null || certName == null) {
return false;
}
hostname = hostname.toLowerCase();
certName = certName.toLowerCase();
// Exact match
if (hostname.equals(certName)) {
return true;
}
// Wildcard matching (*.example.com)
if (certName.startsWith("*.")) {
String domain = certName.substring(2);
// Wildcard must match exactly one level
if (hostname.endsWith("." + domain)) {
String prefix = hostname.substring(0, hostname.length() - domain.length() - 1);
// Ensure wildcard doesn't match multiple levels (no dots in prefix)
return !prefix.contains(".");
}
}
return false;
}
/**
* Extracts Common Name from Subject DN
*/
private String extractCommonName(String subjectDN) {
if (subjectDN == null) {
return null;
}
// Parse DN components
String[] components = subjectDN.split(",");
for (String component : components) {
component = component.trim();
if (component.startsWith("CN=")) {
return component.substring(3).trim();
}
}
return null;
}
/**
* Extracts critical extensions from all certificates in the chain
*/
private Set<String> extractCriticalExtensions(X509Certificate[] certChain) {
Set<String> allCriticalExtensions = new HashSet<>();
for (X509Certificate cert : certChain) {
Set<String> criticalExtensions = cert.getCriticalExtensionOIDs();
if (criticalExtensions != null) {
allCriticalExtensions.addAll(criticalExtensions);
}
}
return allCriticalExtensions;
}
/**
* Gets the key length from a certificate
*/
private int getKeyLength(X509Certificate cert) {
try {
java.security.PublicKey publicKey = cert.getPublicKey();
if (publicKey instanceof java.security.interfaces.RSAPublicKey) {
return ((java.security.interfaces.RSAPublicKey) publicKey).getModulus().bitLength();
} else if (publicKey instanceof java.security.interfaces.DSAPublicKey) {
return ((java.security.interfaces.DSAPublicKey) publicKey).getParams().getP().bitLength();
} else if (publicKey instanceof java.security.interfaces.ECPublicKey) {
return ((java.security.interfaces.ECPublicKey) publicKey).getParams().getOrder().bitLength();
}
} catch (Exception e) {
// Ignore
}
return -1;
}
/**
* Checks if signature algorithm is considered weak
*/
private boolean isWeakSignatureAlgorithm(String sigAlg) {
if (sigAlg == null) return false;
sigAlg = sigAlg.toLowerCase();
return sigAlg.contains("md5") ||
sigAlg.contains("sha1") ||
sigAlg.equals("md2withrsa") ||
sigAlg.equals("md4withrsa");
}
/**
* Checks for deprecated or problematic extensions
*/
private void checkDeprecatedExtensions(X509Certificate cert, int index, List<String> warnings) {
// Check for Netscape extensions (deprecated)
if (cert.getNonCriticalExtensionOIDs() != null) {
for (String oid : cert.getNonCriticalExtensionOIDs()) {
if (oid.startsWith("2.16.840.1.113730")) { // Netscape OID space
warnings.add("Certificate " + index + " contains deprecated Netscape extension: " + oid);
}
}
}
// Additional extension checks can be added here
}
/**
* Loads default trust anchors from Java's cacerts keystore
*/
private Set<TrustAnchor> loadDefaultTrustAnchors() throws Exception {
Set<TrustAnchor> trustAnchors = new HashSet<>();
// Try to load from default locations
String[] keystorePaths = {
System.getProperty("javax.net.ssl.trustStore"),
System.getProperty("java.home") + "/lib/security/cacerts",
System.getProperty("java.home") + "/jre/lib/security/cacerts"
};
String[] keystorePasswords = {
System.getProperty("javax.net.ssl.trustStorePassword"),
"changeit",
""
};
for (String keystorePath : keystorePaths) {
if (keystorePath != null) {
for (String password : keystorePasswords) {
try {
KeyStore trustStore = loadKeyStore(keystorePath, password);
if (trustStore != null) {
trustAnchors.addAll(extractTrustAnchors(trustStore));
if (!trustAnchors.isEmpty()) {
return trustAnchors;
}
}
} catch (Exception e) {
// Try next combination
}
}
}
}
// Fallback: try to get from default trust manager
try {
TrustManagerFactory tmf = TrustManagerFactory.getInstance(
TrustManagerFactory.getDefaultAlgorithm());
tmf.init((KeyStore) null);
for (TrustManager tm : tmf.getTrustManagers()) {
if (tm instanceof X509TrustManager) {
X509TrustManager x509tm = (X509TrustManager) tm;
for (X509Certificate cert : x509tm.getAcceptedIssuers()) {
trustAnchors.add(new TrustAnchor(cert, null));
}
}
}
} catch (Exception e) {
throw new Exception("Failed to load any trust anchors", e);
}
if (trustAnchors.isEmpty()) {
throw new Exception("No trust anchors could be loaded");
}
return trustAnchors;
}
/**
* Loads a keystore from file
*/
private KeyStore loadKeyStore(String keystorePath, String password) throws Exception {
KeyStore keystore = KeyStore.getInstance(KeyStore.getDefaultType());
try (FileInputStream fis = new FileInputStream(keystorePath)) {
keystore.load(fis, password != null ? password.toCharArray() : null);
return keystore;
}
}
/**
* Extracts trust anchors from a keystore
*/
private Set<TrustAnchor> extractTrustAnchors(KeyStore trustStore) throws KeyStoreException {
Set<TrustAnchor> trustAnchors = new HashSet<>();
Enumeration<String> aliases = trustStore.aliases();
while (aliases.hasMoreElements()) {
String alias = aliases.nextElement();
if (trustStore.isCertificateEntry(alias)) {
Certificate cert = trustStore.getCertificate(alias);
if (cert instanceof X509Certificate) {
trustAnchors.add(new TrustAnchor((X509Certificate) cert, null));
}
}
}
return trustAnchors;
}
}

View File

@@ -1,116 +0,0 @@
package nu.marginalia.ping.ssl;
import javax.net.ssl.SSLPeerUnverifiedException;
import javax.net.ssl.SSLSession;
import javax.net.ssl.SSLSessionContext;
import java.security.cert.Certificate;
import java.security.cert.X509Certificate;
/**
* Mock SSL session for hostname verification
*/
public class MockSSLSession implements SSLSession {
private final X509Certificate[] peerCertificates;
public MockSSLSession(X509Certificate cert) {
this.peerCertificates = new X509Certificate[]{cert};
}
@Override
public Certificate[] getPeerCertificates() throws SSLPeerUnverifiedException {
return peerCertificates;
}
// All other methods return default/empty values as they're not used by hostname verification
@Override
public byte[] getId() {
return new byte[0];
}
@Override
public SSLSessionContext getSessionContext() {
return null;
}
@Override
public long getCreationTime() {
return 0;
}
@Override
public long getLastAccessedTime() {
return 0;
}
@Override
public void invalidate() {
}
@Override
public boolean isValid() {
return true;
}
@Override
public void putValue(String name, Object value) {
}
@Override
public Object getValue(String name) {
return null;
}
@Override
public void removeValue(String name) {
}
@Override
public String[] getValueNames() {
return new String[0];
}
@Override
public java.security.Principal getPeerPrincipal() throws SSLPeerUnverifiedException {
return null;
}
@Override
public java.security.Principal getLocalPrincipal() {
return null;
}
@Override
public String getCipherSuite() {
return "";
}
@Override
public String getProtocol() {
return "";
}
@Override
public String getPeerHost() {
return "";
}
@Override
public int getPeerPort() {
return 0;
}
@Override
public int getPacketBufferSize() {
return 0;
}
@Override
public int getApplicationBufferSize() {
return 0;
}
@Override
public Certificate[] getLocalCertificates() {
return new Certificate[0];
}
}

View File

@@ -1,14 +0,0 @@
package nu.marginalia.ping.ssl;
import java.security.cert.CertPath;
import java.security.cert.PKIXCertPathValidatorResult;
import java.util.Set;
public record PKIXValidationResult(boolean isValid, String errorMessage,
Set<PkixValidationError> errors,
PKIXCertPathValidatorResult pkixResult,
CertPath validatedPath,
Set<String> criticalExtensions,
boolean hostnameValid)
{
}

View File

@@ -1,11 +0,0 @@
package nu.marginalia.ping.ssl;
public enum PkixValidationError {
SAN_MISMATCH,
EXPIRED,
NOT_YET_VALID,
PATH_VALIDATION_FAILED,
INVALID_PKIX_PARAMETERS,
UNKNOWN,
UNSPECIFIED_HOST_ERROR;
}

View File

@@ -0,0 +1,57 @@
package nu.marginalia.ping.ssl;
import java.security.cert.TrustAnchor;
import java.time.Duration;
import java.util.Set;
public class RootCerts {
private static final String MOZILLA_CA_BUNDLE_URL = "https://curl.se/ca/cacert.pem";
volatile static boolean initialized = false;
volatile static Set<TrustAnchor> trustAnchors;
public static Set<TrustAnchor> getTrustAnchors() {
if (!initialized) {
try {
synchronized (RootCerts.class) {
while (!initialized) {
RootCerts.class.wait(100);
}
}
}
catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new RuntimeException("RootCerts initialization interrupted", e);
}
}
return trustAnchors;
}
static {
Thread.ofPlatform()
.name("RootCertsUpdater")
.daemon()
.unstarted(RootCerts::updateTrustAnchors)
.start();
}
private static void updateTrustAnchors() {
while (true) {
try {
trustAnchors = CertificateFetcher.getRootCerts(MOZILLA_CA_BUNDLE_URL);
synchronized (RootCerts.class) {
initialized = true;
RootCerts.class.notifyAll(); // Notify any waiting threads
}
Thread.sleep(Duration.ofHours(24));
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
break; // Exit if interrupted
} catch (Exception e) {
// Log the exception and continue to retry
System.err.println("Failed to update trust anchors: " + e.getMessage());
}
}
}
}

View File

@@ -9,7 +9,7 @@ import nu.marginalia.ping.fetcher.response.HttpsResponse;
import nu.marginalia.ping.model.DomainAvailabilityRecord;
import nu.marginalia.ping.model.ErrorClassification;
import nu.marginalia.ping.model.HttpSchema;
import nu.marginalia.ping.ssl.PKIXValidationResult;
import nu.marginalia.ping.ssl.CertificateValidator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -124,7 +124,7 @@ public class DomainAvailabilityInformationFactory {
int nodeId,
@Nullable InetAddress address,
@Nullable DomainAvailabilityRecord previousRecord,
PKIXValidationResult validationResult,
CertificateValidator.ValidationResult validationResult,
HttpsResponse rsp) {
Instant updateTime;

View File

@@ -4,7 +4,7 @@ import nu.marginalia.ping.fetcher.response.HttpResponse;
import nu.marginalia.ping.fetcher.response.HttpsResponse;
import nu.marginalia.ping.model.DomainSecurityRecord;
import nu.marginalia.ping.model.HttpSchema;
import nu.marginalia.ping.ssl.PKIXValidationResult;
import nu.marginalia.ping.ssl.CertificateValidator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -52,7 +52,7 @@ public class DomainSecurityInformationFactory {
// HTTPS response
public DomainSecurityRecord createHttpsSecurityInformation(
HttpsResponse httpResponse,
PKIXValidationResult validationResult,
CertificateValidator.ValidationResult validationResult,
int domainId,
int nodeId,
@Nullable Integer asn
@@ -126,6 +126,9 @@ public class DomainSecurityInformationFactory {
.sslCertWildcard(isWildcard)
.sslCertificateChainLength(sslCertificates.length)
.sslCertificateValid(validationResult.isValid())
.sslHostValid(validationResult.hostnameValid)
.sslChainValid(validationResult.chainValid)
.sslDateValid(!validationResult.certificateExpired)
.httpVersion(httpResponse.version())
.tsLastUpdate(Instant.now())
.build();

View File

@@ -8,8 +8,7 @@ import nu.marginalia.ping.fetcher.response.*;
import nu.marginalia.ping.model.*;
import nu.marginalia.ping.model.comparison.DomainAvailabilityChange;
import nu.marginalia.ping.model.comparison.SecurityInformationChange;
import nu.marginalia.ping.ssl.CustomPKIXValidator;
import nu.marginalia.ping.ssl.PKIXValidationResult;
import nu.marginalia.ping.ssl.CertificateValidator;
import nu.marginalia.ping.util.JsonObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -32,9 +31,7 @@ public class HttpPingService {
private final DomainAvailabilityInformationFactory domainAvailabilityInformationFactory;
private final DomainSecurityInformationFactory domainSecurityInformationFactory;
private static final Logger logger = LoggerFactory.getLogger(HttpPingService.class);
CustomPKIXValidator validator;
@Inject
public HttpPingService(
@@ -46,7 +43,6 @@ public class HttpPingService {
this.pingHttpFetcher = pingHttpFetcher;
this.domainAvailabilityInformationFactory = domainAvailabilityInformationFactory;
this.domainSecurityInformationFactory = domainSecurityInformationFactory;
this.validator = new CustomPKIXValidator();
}
private int compareInetAddresses(InetAddress a, InetAddress b) {
@@ -164,7 +160,11 @@ public class HttpPingService {
);
}
case HttpsResponse httpsResponse -> {
PKIXValidationResult validationResult = validator.validateCertificateChain(domainReference.domainName(), (X509Certificate[]) httpsResponse.sslCertificates());
var validationResult = CertificateValidator.validateCertificate(
(X509Certificate[]) httpsResponse.sslCertificates(),
domainReference.domainName(),
true
);
newPingStatus = domainAvailabilityInformationFactory.createHttpsResponse(
domainReference.domainId(),

View File

@@ -243,6 +243,7 @@ class PingDaoTest {
.headerServer("Apache/2.4.41 (Ubuntu)")
.headerXPoweredBy("PHP/7.4.3")
.tsLastUpdate(Instant.now())
.sslHostValid(true)
.build();
var svc = new PingDao(dataSource);
svc.write(foo);

View File

@@ -60,6 +60,7 @@ class PingHttpServiceTest {
}
@Tag("flaky") // Do not run this test in CI
@Test
public void testGetSslInfo() throws Exception {
var provider = new HttpClientProvider();

View File

@@ -4,6 +4,7 @@ public class ProcessInboxNames {
public static final String CONVERTER_INBOX = "converter";
public static final String LOADER_INBOX = "loader";
public static final String PING_INBOX = "ping";
public static final String NDP_INBOX = "ndp";
public static final String CRAWLER_INBOX = "crawler";
public static final String LIVE_CRAWLER_INBOX = "live-crawler";

View File

@@ -0,0 +1,4 @@
package nu.marginalia.mqapi.ndp;
public record NdpRequest(int goal) {
}

View File

@@ -25,6 +25,11 @@ into the [MariaDB database](../common/db).
The [index-construction-process](index-constructor-process/) constructs indices from
the data generated by the loader.
## 5. Other Processes
* Ping Process: The [ping-process](ping-process/) keeps track of the aliveness of websites, gathering fingerprint information about the security posture of the website, as well as DNS information.
* Live-Crawling Process: The [live-crawling-process](live-crawling-process/) is a process that crawls websites in real-time based on RSS feeds, updating a smaller index with the latest content.
## Overview
Schematically the crawling and loading process looks like this:

View File

@@ -22,6 +22,7 @@ import nu.marginalia.search.model.NavbarModel;
import nu.marginalia.search.model.ResultsPage;
import nu.marginalia.search.model.UrlDetails;
import nu.marginalia.search.svc.SearchFlagSiteService.FlagSiteFormData;
import nu.marginalia.service.server.RateLimiter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -47,6 +48,8 @@ public class SearchSiteInfoService {
private final HikariDataSource dataSource;
private final SearchSiteSubscriptionService searchSiteSubscriptions;
private final RateLimiter rateLimiter = RateLimiter.custom(60);
@Inject
public SearchSiteInfoService(SearchOperator searchOperator,
DomainInfoClient domainInfoClient,
@@ -238,6 +241,7 @@ public class SearchSiteInfoService {
boolean hasScreenshot = screenshotService.hasScreenshot(domainId);
boolean isSubscribed = searchSiteSubscriptions.isSubscribed(context, domain);
boolean rateLimited = !rateLimiter.isAllowed();
if (domainId < 0) {
domainInfoFuture = CompletableFuture.failedFuture(new Exception("Unknown Domain ID"));
similarSetFuture = CompletableFuture.failedFuture(new Exception("Unknown Domain ID"));
@@ -250,6 +254,12 @@ public class SearchSiteInfoService {
linkingDomainsFuture = CompletableFuture.failedFuture(new Exception("Assistant Service Unavailable"));
feedItemsFuture = CompletableFuture.failedFuture(new Exception("Assistant Service Unavailable"));
}
else if (rateLimited) {
domainInfoFuture = domainInfoClient.domainInformation(domainId);
similarSetFuture = CompletableFuture.failedFuture(new Exception("Rate limit exceeded"));
linkingDomainsFuture = CompletableFuture.failedFuture(new Exception("Rate limit exceeded"));
feedItemsFuture = CompletableFuture.failedFuture(new Exception("Rate limit exceeded"));
}
else {
domainInfoFuture = domainInfoClient.domainInformation(domainId);
similarSetFuture = domainInfoClient.similarDomains(domainId, 25);
@@ -257,7 +267,14 @@ public class SearchSiteInfoService {
feedItemsFuture = feedsClient.getFeed(domainId);
}
List<UrlDetails> sampleResults = searchOperator.doSiteSearch(domainName, domainId,5, 1).results;
List<UrlDetails> sampleResults;
if (rateLimited) {
sampleResults = List.of();
}
else {
sampleResults = searchOperator.doSiteSearch(domainName, domainId, 5, 1).results;
}
if (!sampleResults.isEmpty()) {
url = sampleResults.getFirst().url.withPathAndParam("/", null).toString();
}
@@ -276,8 +293,9 @@ public class SearchSiteInfoService {
sampleResults
);
requestMissingScreenshots(result);
if (!rateLimited) {
requestMissingScreenshots(result);
}
return result;
}

View File

@@ -0,0 +1,11 @@
@param String message
<!DOCTYPE html>
<html lang="en">
<head><meta charset="UTF-8">
<title>Unavailable</title></head>
<body>
<h1>Service Overloaded</h1>
<p>${message}</p>
</body>
</html>

View File

@@ -69,6 +69,7 @@ include 'code:processes:crawling-process:ft-link-parser'
include 'code:processes:crawling-process:ft-content-type'
include 'code:processes:live-crawling-process'
include 'code:processes:ping-process'
include 'code:processes:new-domain-process'
include 'code:processes:process-mq-api'