1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits

...

3 Commits

Author SHA1 Message Date
Viktor Lofgren
1adf4835fa (ping) Add schema change information to domain security events
Particularly the HTTPS->HTTP-change event appears to be a strong indicator of domain parking.
2025-06-15 16:47:49 +02:00
Viktor Lofgren
b7b5d0bf46 (ping) More accurately detect connection errors 2025-06-15 16:47:07 +02:00
Viktor Lofgren
416059adde (ping) Avoid thread starvation scenario in job scheduling
Adjust the queueing strategy to avoid thread starvation from whale domains with many subdomains all locking on the same semaphore and gunking up all threads by implementing a mechanism that returns jobs that can't be executed to the queue.

This will lead to some queue churn, but it should be fairly manageable given the small number of threads involved, and the fairly long job execution times.
2025-06-15 11:04:34 +02:00
13 changed files with 167 additions and 28 deletions

View File

@@ -0,0 +1,5 @@
-- Add additional summary columns to DOMAIN_SECURITY_EVENTS table
-- to make it easier to make sense of certificate changes
ALTER TABLE DOMAIN_SECURITY_EVENTS ADD COLUMN CHANGE_SCHEMA ENUM('NO_CHANGE', 'HTTP_TO_HTTPS', 'HTTPS_TO_HTTP', 'UNKNOWN') NOT NULL DEFAULT 'UNKNOWN';
OPTIMIZE TABLE DOMAIN_SECURITY_EVENTS;

View File

@@ -1,5 +1,6 @@
package nu.marginalia.coordination;
import com.google.inject.Singleton;
import nu.marginalia.model.EdgeDomain;
import java.time.Duration;
@@ -9,6 +10,7 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
@Singleton
public class LocalDomainCoordinator implements DomainCoordinator {
// The locks are stored in a map, with the domain name as the key. This map will grow
// relatively big, but should be manageable since the number of domains is limited to

View File

@@ -1,6 +1,7 @@
package nu.marginalia.coordination;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.service.discovery.ServiceRegistryIf;
@@ -13,11 +14,14 @@ import java.util.Optional;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
@Singleton
public class ZookeeperDomainCoordinator implements DomainCoordinator {
// The locks are stored in a map, with the domain name as the key. This map will grow
// relatively big, but should be manageable since the number of domains is limited to
// a few hundred thousand typically.
private final Map<String, InterProcessSemaphoreV2> locks = new ConcurrentHashMap<>();
private final Map<String, Integer> waitCounts = new ConcurrentHashMap<>();
private final ServiceRegistryIf serviceRegistry;
private final int nodeId;
@@ -32,27 +36,35 @@ public class ZookeeperDomainCoordinator implements DomainCoordinator {
* and may be held by another thread. The caller is responsible for locking and releasing the lock.
*/
public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::createSemapore);
final String key = domain.topDomain.toLowerCase();
var sem = locks.computeIfAbsent(key, this::createSemapore);
// Increment or add a wait count for the domain
waitCounts.compute(key, (k,value) -> (value == null ? 1 : value + 1));
try {
var lease = sem.acquire();
return new ZkDomainLock(sem, lease);
return new ZkDomainLock(sem, sem.acquire());
}
catch (Exception e) {
throw new RuntimeException("Failed to acquire lock for domain: " + domain.topDomain, e);
}
finally {
// Decrement or remove the wait count for the domain
waitCounts.compute(key, (k,value) -> (value == null || value <= 1) ? null : value - 1);
}
}
public Optional<DomainLock> tryLockDomain(EdgeDomain domain) throws InterruptedException {
return tryLockDomain(domain, Duration.ofSeconds(1)); // Underlying semaphore doesn't have a tryLock method, so we use a short timeout
}
public Optional<DomainLock> tryLockDomain(EdgeDomain domain, Duration timeout) throws InterruptedException {
final String key = domain.topDomain.toLowerCase();
var sem = locks.computeIfAbsent(key, this::createSemapore);
var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::createSemapore);
// Increment or add a wait count for the domain
waitCounts.compute(key, (k,value) -> (value == null ? 1 : value + 1));
try {
var lease = sem.acquire(timeout.toMillis(), TimeUnit.MILLISECONDS); // Acquire with timeout
if (lease != null) {
@@ -65,6 +77,9 @@ public class ZookeeperDomainCoordinator implements DomainCoordinator {
catch (Exception e) {
return Optional.empty(); // If we fail to acquire the lock, we return an empty optional
}
finally {
waitCounts.compute(key, (k,value) -> (value == null || value <= 1) ? null : value - 1);
}
}
private InterProcessSemaphoreV2 createSemapore(String topDomain){
@@ -81,7 +96,7 @@ public class ZookeeperDomainCoordinator implements DomainCoordinator {
* after this method returns true)
*/
public boolean isLockableHint(EdgeDomain domain) {
return true; // Curator does not provide a way to check if a lock is available without acquiring it
return !waitCounts.containsKey(domain.topDomain.toLowerCase());
}
public static class ZkDomainLock implements DomainLock {

View File

@@ -185,12 +185,12 @@ public class PingDao {
return null;
}
public List<UpdateSchedule.UpdateJob<Long, HistoricalAvailabilityData>> getDomainUpdateSchedule(int nodeId) {
List<UpdateSchedule.UpdateJob<Long, HistoricalAvailabilityData>> updateJobs = new ArrayList<>();
public List<UpdateSchedule.UpdateJob<DomainReference, HistoricalAvailabilityData>> getDomainUpdateSchedule(int nodeId) {
List<UpdateSchedule.UpdateJob<DomainReference, HistoricalAvailabilityData>> updateJobs = new ArrayList<>();
try (var conn = dataSource.getConnection();
var ps = conn.prepareStatement("""
SELECT ID, NEXT_SCHEDULED_UPDATE
SELECT ID, DOMAIN_NAME, NEXT_SCHEDULED_UPDATE
FROM EC_DOMAIN
LEFT JOIN DOMAIN_AVAILABILITY_INFORMATION
ON EC_DOMAIN.ID = DOMAIN_AVAILABILITY_INFORMATION.DOMAIN_ID
@@ -200,11 +200,13 @@ public class PingDao {
ps.setInt(1, nodeId);
ResultSet rs = ps.executeQuery();
while (rs.next()) {
long domainId = rs.getLong("ID");
int domainId = rs.getInt("ID");
String domainName = rs.getString("DOMAIN_NAME");
var ts = rs.getTimestamp("NEXT_SCHEDULED_UPDATE");
Instant nextUpdate = ts == null ? Instant.now() : ts.toInstant();
updateJobs.add(new UpdateSchedule.UpdateJob<>(domainId, nextUpdate));
var ref = new DomainReference(domainId, nodeId, domainName.toLowerCase());
updateJobs.add(new UpdateSchedule.UpdateJob<>(ref, nextUpdate));
}
} catch (SQLException e) {
throw new RuntimeException("Failed to retrieve domain update schedule", e);
@@ -241,7 +243,7 @@ public class PingDao {
else {
var record = new DomainDnsRecord(rs);
updateJobs.add(new UpdateSchedule.UpdateJob<>(
new RootDomainReference.ById(dnsRootDomainId),
new RootDomainReference.ByIdAndName(dnsRootDomainId, rootDomainName),
Objects.requireNonNullElseGet(record.tsNextScheduledUpdate(), Instant::now))
);
}

View File

@@ -2,6 +2,7 @@ package nu.marginalia.ping;
import com.google.inject.Inject;
import nu.marginalia.coordination.DomainCoordinator;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.ping.model.*;
import nu.marginalia.ping.svc.DnsPingService;
import nu.marginalia.ping.svc.HttpPingService;
@@ -31,7 +32,7 @@ public class PingJobScheduler {
private static final UpdateSchedule<RootDomainReference, RootDomainReference> dnsUpdateSchedule
= new UpdateSchedule<>(250_000);
private static final UpdateSchedule<Long, HistoricalAvailabilityData> availabilityUpdateSchedule
private static final UpdateSchedule<DomainReference, HistoricalAvailabilityData> availabilityUpdateSchedule
= new UpdateSchedule<>(250_000);
public volatile Instant dnsLastSync = Instant.now();
@@ -138,7 +139,15 @@ public class PingJobScheduler {
continue;
}
long nextId = availabilityUpdateSchedule.next();
DomainReference ref = availabilityUpdateSchedule.nextIf(domain -> {
EdgeDomain domainObj = new EdgeDomain(domain.domainName());
if (!domainCoordinator.isLockableHint(domainObj)) {
return false; // Skip locked domains
}
return true; // Process this domain
});
long nextId = ref.domainId();
var data = pingDao.getHistoricalAvailabilityData(nextId);
if (data == null) {
logger.warn("No availability data found for ID: {}", nextId);
@@ -163,7 +172,7 @@ public class PingJobScheduler {
for (var object : objects) {
var ts = object.nextUpdateTime();
if (ts != null) {
availabilityUpdateSchedule.add(nextId, ts);
availabilityUpdateSchedule.add(ref, ts);
break;
}
}
@@ -194,7 +203,7 @@ public class PingJobScheduler {
try {
List<WritableModel> objects = switch(ref) {
case RootDomainReference.ById(long id) -> {
case RootDomainReference.ByIdAndName(long id, String name) -> {
var oldRecord = Objects.requireNonNull(pingDao.getDomainDnsRecord(id));
yield dnsPingService.pingDomain(oldRecord.rootDomainName(), oldRecord);
}

View File

@@ -2,9 +2,8 @@ package nu.marginalia.ping;
import java.time.Duration;
import java.time.Instant;
import java.util.Collection;
import java.util.Comparator;
import java.util.PriorityQueue;
import java.util.*;
import java.util.function.Predicate;
/** In-memory schedule for updates, allowing jobs to be added and processed in order of their scheduled time.
* This is not a particularly high-performance implementation, but exists to take contention off the database's
@@ -23,6 +22,9 @@ public class UpdateSchedule<T, T2> {
notifyAll();
}
/** Returns the next job in the queue that is due to be processed.
* If no jobs are due, it will block until a job is added or a job becomes due.
* */
public synchronized T next() throws InterruptedException {
while (true) {
if (updateQueue.isEmpty()) {
@@ -44,6 +46,56 @@ public class UpdateSchedule<T, T2> {
}
}
/** Returns the first job in the queue matching the predicate that is not scheduled into the future,
* blocking until a job is added or a job becomes due.
*/
public synchronized T nextIf(Predicate<T> predicate) throws InterruptedException {
List<UpdateJob<T, T2>> rejectedJobs = new ArrayList<>();
try {
while (true) {
if (updateQueue.isEmpty()) {
wait(); // Wait for a new job to be added
continue;
}
UpdateJob<T, T2> job = updateQueue.peek();
Instant now = Instant.now();
if (job.updateTime.isAfter(now)) {
Duration toWait = Duration.between(now, job.updateTime);
// Return the rejected jobs to the queue for other threads to process
updateQueue.addAll(rejectedJobs);
if (!rejectedJobs.isEmpty())
notifyAll();
rejectedJobs.clear();
wait(Math.max(1, toWait.toMillis()));
} else {
var candidate = updateQueue.poll(); // Remove the job from the queue since it's due
assert candidate != null : "Update job should not be null at this point, since we just peeked it in a synchronized block";
if (!predicate.test(candidate.key())) {
rejectedJobs.add(candidate);
}
else {
return candidate.key();
}
}
}
}
finally {
// Return the rejected jobs to the queue for other threads to process
updateQueue.addAll(rejectedJobs);
if (!rejectedJobs.isEmpty())
notifyAll();
}
}
public synchronized void clear() {
updateQueue.clear();
notifyAll();

View File

@@ -4,6 +4,7 @@ import com.google.inject.Inject;
import nu.marginalia.UserAgent;
import nu.marginalia.WmsaHome;
import nu.marginalia.ping.fetcher.response.*;
import org.apache.hc.client5.http.HttpHostConnectException;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.protocol.HttpClientContext;
import org.apache.hc.core5.http.Header;
@@ -82,9 +83,12 @@ public class PingHttpFetcher {
});
} catch (SocketTimeoutException ex) {
return new TimeoutResponse(ex.getMessage());
} catch (IOException e) {
} catch (HttpHostConnectException e) {
return new ConnectionError(e.getClass().getSimpleName());
} catch (IOException e) {
return new ProtocolError(e.getClass().getSimpleName());
}
}
}

View File

@@ -18,6 +18,7 @@ public record DomainSecurityEvent(
boolean certificatePublicKeyChanged,
boolean certificateSerialNumberChanged,
boolean certificateIssuerChanged,
SchemaChange schemaChange,
Duration oldCertificateTimeToExpiry,
boolean securityHeadersChanged,
boolean ipChanged,
@@ -45,8 +46,9 @@ public record DomainSecurityEvent(
security_signature_before,
security_signature_after,
change_certificate_serial_number,
change_certificate_issuer
) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
change_certificate_issuer,
change_schema
) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
"""))
{
@@ -81,6 +83,7 @@ public record DomainSecurityEvent(
ps.setBoolean(15, certificateSerialNumberChanged());
ps.setBoolean(16, certificateIssuerChanged());
ps.setString(17, schemaChange.name());
ps.executeUpdate();
}

View File

@@ -1,6 +1,6 @@
package nu.marginalia.ping.model;
public sealed interface RootDomainReference {
record ById(long id) implements RootDomainReference { }
record ByIdAndName(long id, String name) implements RootDomainReference { }
record ByName(String name) implements RootDomainReference { }
}

View File

@@ -0,0 +1,12 @@
package nu.marginalia.ping.model;
public enum SchemaChange {
UNKNOWN,
NO_CHANGE,
HTTP_TO_HTTPS,
HTTPS_TO_HTTP;
public boolean isSignificant() {
return this != NO_CHANGE && this != UNKNOWN;
}
}

View File

@@ -2,6 +2,9 @@ package nu.marginalia.ping.model.comparison;
import nu.marginalia.ping.model.DomainAvailabilityRecord;
import nu.marginalia.ping.model.DomainSecurityRecord;
import nu.marginalia.ping.model.HttpSchema;
import nu.marginalia.ping.model.SchemaChange;
import org.jetbrains.annotations.NotNull;
import java.time.Duration;
import java.time.Instant;
@@ -20,7 +23,8 @@ public record SecurityInformationChange(
Duration oldCertificateTimeToExpiry,
boolean isSecurityHeadersChanged,
boolean isIpAddressChanged,
boolean isSoftwareHeaderChanged
boolean isSoftwareHeaderChanged,
SchemaChange schemaChange
) {
public static SecurityInformationChange between(
DomainSecurityRecord before, DomainAvailabilityRecord availabilityBefore,
@@ -43,9 +47,10 @@ public record SecurityInformationChange(
);
boolean securityHeadersChanged = before.securityHeadersHash() != after.securityHeadersHash();
boolean softwareChanged = !Objects.equals(before.headerServer(), after.headerServer());
SchemaChange schemaChange = getSchemaChange(before, after);
// Note we don't include IP address changes in the overall change status,
// as this is not alone considered a change in security information; we may have
// multiple IP addresses for a domain, and the IP address may change frequently
@@ -55,7 +60,8 @@ public record SecurityInformationChange(
|| certificateFingerprintChanged
|| securityHeadersChanged
|| certificateProfileChanged
|| softwareChanged;
|| softwareChanged
|| schemaChange.isSignificant();
return new SecurityInformationChange(
isChanged,
@@ -69,9 +75,36 @@ public record SecurityInformationChange(
oldCertificateTimeToExpiry,
securityHeadersChanged,
ipChanged,
softwareChanged
softwareChanged,
schemaChange
);
}
private static @NotNull SchemaChange getSchemaChange(DomainSecurityRecord before, DomainSecurityRecord after) {
if (before.httpSchema() == null || after.httpSchema() == null) {
return SchemaChange.UNKNOWN;
}
boolean beforeIsHttp = before.httpSchema() == HttpSchema.HTTP;
boolean afterIsHttp = after.httpSchema() == HttpSchema.HTTP;
boolean beforeIsHttps = before.httpSchema() == HttpSchema.HTTPS;
boolean afterIsHttps = after.httpSchema() == HttpSchema.HTTPS;
SchemaChange schemaChange;
if (beforeIsHttp && afterIsHttp) {
schemaChange = SchemaChange.NO_CHANGE;
} else if (beforeIsHttps && afterIsHttps) {
schemaChange = SchemaChange.NO_CHANGE;
} else if (beforeIsHttp && afterIsHttps) {
schemaChange = SchemaChange.HTTP_TO_HTTPS;
} else if (beforeIsHttps && afterIsHttp) {
schemaChange = SchemaChange.HTTPS_TO_HTTP;
} else {
schemaChange = SchemaChange.UNKNOWN;
}
return schemaChange;
}
}

View File

@@ -296,6 +296,7 @@ public class HttpPingService {
change.isCertificatePublicKeyChanged(),
change.isCertificateSerialNumberChanged(),
change.isCertificateIssuerChanged(),
change.schemaChange(),
change.oldCertificateTimeToExpiry(),
change.isSecurityHeadersChanged(),
change.isIpAddressChanged(),

View File

@@ -320,6 +320,7 @@ class PingDaoTest {
true,
true,
false,
SchemaChange.NO_CHANGE,
Duration.ofDays(30),
false,
false,