mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
Compare commits
78 Commits
deploy-021
...
deploy-026
Author | SHA1 | Date | |
---|---|---|---|
|
294ab19177 | ||
|
6f1659ecb2 | ||
|
982dcb28f0 | ||
|
fc686d8b2e | ||
|
69ef0f334a | ||
|
446746f3bd | ||
|
24ab8398bb | ||
|
d2ceeff4cf | ||
|
cf64214b1c | ||
|
e50d09cc01 | ||
|
bce3892ce0 | ||
|
36581b25c2 | ||
|
52ff7fb4dd | ||
|
a4e49e658a | ||
|
e2c56dc3ca | ||
|
470b866008 | ||
|
4895a2ac7a | ||
|
fd32ae9fa7 | ||
|
470651ea4c | ||
|
8d4829e783 | ||
|
1290bc15dc | ||
|
e7fa558954 | ||
|
720685bf3f | ||
|
cbec63c7da | ||
|
b03ca75785 | ||
|
184aedc071 | ||
|
0275bad281 | ||
|
fd83a9d0b8 | ||
|
d556f8ae3a | ||
|
e37559837b | ||
|
3564c4aaee | ||
|
92c54563ab | ||
|
d7a5d90b07 | ||
|
0a0e88fd6e | ||
|
b4fc0c4368 | ||
|
87ee8765b8 | ||
|
1adf4835fa | ||
|
b7b5d0bf46 | ||
|
416059adde | ||
|
db7930016a | ||
|
82456ad673 | ||
|
0882a6d9cd | ||
|
5020029c2d | ||
|
ac44d0b093 | ||
|
4b32b9b10e | ||
|
9f041d6631 | ||
|
13fb1efce4 | ||
|
c1225165b7 | ||
|
67ad7a3bbc | ||
|
ed62ec8a35 | ||
|
42b24cfa34 | ||
|
1ffaab2da6 | ||
|
5f93c7f767 | ||
|
4001c68c82 | ||
|
6b811489c5 | ||
|
e9d317c65d | ||
|
16b05a4737 | ||
|
021cd73cbb | ||
|
4253bd53b5 | ||
|
14c87461a5 | ||
|
9afed0a18e | ||
|
afad4deb94 | ||
|
f071c947e4 | ||
|
79996c9348 | ||
|
db907ab06a | ||
|
c49cd9dd95 | ||
|
eec9df3b0a | ||
|
e5f3288de6 | ||
|
d587544d3a | ||
|
1a9ae1bc40 | ||
|
e0c81e956a | ||
|
542fb12b38 | ||
|
65ec734566 | ||
|
6260f6bec7 | ||
|
d6d5467696 | ||
|
034560ca75 | ||
|
e994fddae4 | ||
|
345f01f306 |
@@ -1,3 +1,8 @@
|
||||
package nu.marginalia;
|
||||
|
||||
/**
|
||||
* A record representing a User Agent.
|
||||
* @param uaString - the header value of the User Agent
|
||||
* @param uaIdentifier - what we look for in robots.txt
|
||||
*/
|
||||
public record UserAgent(String uaString, String uaIdentifier) {}
|
||||
|
@@ -45,7 +45,7 @@ public class NodeConfigurationService {
|
||||
public List<NodeConfiguration> getAll() {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var qs = conn.prepareStatement("""
|
||||
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, NODE_PROFILE, DISABLED
|
||||
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, AUTO_ASSIGN_DOMAINS, KEEP_WARCS, NODE_PROFILE, DISABLED
|
||||
FROM NODE_CONFIGURATION
|
||||
""")) {
|
||||
var rs = qs.executeQuery();
|
||||
@@ -59,6 +59,7 @@ public class NodeConfigurationService {
|
||||
rs.getBoolean("ACCEPT_QUERIES"),
|
||||
rs.getBoolean("AUTO_CLEAN"),
|
||||
rs.getBoolean("PRECESSION"),
|
||||
rs.getBoolean("AUTO_ASSIGN_DOMAINS"),
|
||||
rs.getBoolean("KEEP_WARCS"),
|
||||
NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
|
||||
rs.getBoolean("DISABLED")
|
||||
@@ -75,7 +76,7 @@ public class NodeConfigurationService {
|
||||
public NodeConfiguration get(int nodeId) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var qs = conn.prepareStatement("""
|
||||
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, NODE_PROFILE, DISABLED
|
||||
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, AUTO_ASSIGN_DOMAINS, KEEP_WARCS, NODE_PROFILE, DISABLED
|
||||
FROM NODE_CONFIGURATION
|
||||
WHERE ID=?
|
||||
""")) {
|
||||
@@ -88,6 +89,7 @@ public class NodeConfigurationService {
|
||||
rs.getBoolean("ACCEPT_QUERIES"),
|
||||
rs.getBoolean("AUTO_CLEAN"),
|
||||
rs.getBoolean("PRECESSION"),
|
||||
rs.getBoolean("AUTO_ASSIGN_DOMAINS"),
|
||||
rs.getBoolean("KEEP_WARCS"),
|
||||
NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
|
||||
rs.getBoolean("DISABLED")
|
||||
@@ -102,7 +104,7 @@ public class NodeConfigurationService {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var us = conn.prepareStatement("""
|
||||
UPDATE NODE_CONFIGURATION
|
||||
SET DESCRIPTION=?, ACCEPT_QUERIES=?, AUTO_CLEAN=?, PRECESSION=?, KEEP_WARCS=?, DISABLED=?, NODE_PROFILE=?
|
||||
SET DESCRIPTION=?, ACCEPT_QUERIES=?, AUTO_CLEAN=?, PRECESSION=?, AUTO_ASSIGN_DOMAINS=?, KEEP_WARCS=?, DISABLED=?, NODE_PROFILE=?
|
||||
WHERE ID=?
|
||||
"""))
|
||||
{
|
||||
@@ -110,10 +112,11 @@ public class NodeConfigurationService {
|
||||
us.setBoolean(2, config.acceptQueries());
|
||||
us.setBoolean(3, config.autoClean());
|
||||
us.setBoolean(4, config.includeInPrecession());
|
||||
us.setBoolean(5, config.keepWarcs());
|
||||
us.setBoolean(6, config.disabled());
|
||||
us.setString(7, config.profile().name());
|
||||
us.setInt(8, config.node());
|
||||
us.setBoolean(5, config.autoAssignDomains());
|
||||
us.setBoolean(6, config.keepWarcs());
|
||||
us.setBoolean(7, config.disabled());
|
||||
us.setString(8, config.profile().name());
|
||||
us.setInt(9, config.node());
|
||||
|
||||
if (us.executeUpdate() <= 0)
|
||||
throw new IllegalStateException("Failed to update configuration");
|
||||
|
@@ -5,6 +5,7 @@ public record NodeConfiguration(int node,
|
||||
boolean acceptQueries,
|
||||
boolean autoClean,
|
||||
boolean includeInPrecession,
|
||||
boolean autoAssignDomains,
|
||||
boolean keepWarcs,
|
||||
NodeProfile profile,
|
||||
boolean disabled
|
||||
|
@@ -20,9 +20,7 @@ public enum NodeProfile {
|
||||
}
|
||||
|
||||
public boolean permitBatchCrawl() {
|
||||
return isBatchCrawl() ||isMixed();
|
||||
}
|
||||
public boolean permitSideload() {
|
||||
return isMixed() || isSideload();
|
||||
return isBatchCrawl() || isMixed();
|
||||
}
|
||||
public boolean permitSideload() { return isSideload() || isMixed(); }
|
||||
}
|
||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.nodecfg;
|
||||
|
||||
import com.zaxxer.hikari.HikariConfig;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.nodecfg.model.NodeConfiguration;
|
||||
import nu.marginalia.nodecfg.model.NodeProfile;
|
||||
import nu.marginalia.test.TestMigrationLoader;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
@@ -62,6 +63,63 @@ public class NodeConfigurationServiceTest {
|
||||
assertEquals(2, list.size());
|
||||
assertEquals(a, list.get(0));
|
||||
assertEquals(b, list.get(1));
|
||||
}
|
||||
|
||||
|
||||
// Test all the fields that are only exposed via save()
|
||||
@Test
|
||||
public void testSaveChanges() throws SQLException {
|
||||
var original = nodeConfigurationService.create(1, "Test", false, false, NodeProfile.MIXED);
|
||||
|
||||
assertEquals(1, original.node());
|
||||
assertEquals("Test", original.description());
|
||||
assertFalse(original.acceptQueries());
|
||||
|
||||
var precession = new NodeConfiguration(
|
||||
original.node(),
|
||||
"Foo",
|
||||
true,
|
||||
original.autoClean(),
|
||||
original.includeInPrecession(),
|
||||
!original.autoAssignDomains(),
|
||||
original.keepWarcs(),
|
||||
original.profile(),
|
||||
original.disabled()
|
||||
);
|
||||
|
||||
nodeConfigurationService.save(precession);
|
||||
precession = nodeConfigurationService.get(original.node());
|
||||
assertNotEquals(original.autoAssignDomains(), precession.autoAssignDomains());
|
||||
|
||||
var autoClean = new NodeConfiguration(
|
||||
original.node(),
|
||||
"Foo",
|
||||
true,
|
||||
!original.autoClean(),
|
||||
original.includeInPrecession(),
|
||||
original.autoAssignDomains(),
|
||||
original.keepWarcs(),
|
||||
original.profile(),
|
||||
original.disabled()
|
||||
);
|
||||
|
||||
nodeConfigurationService.save(autoClean);
|
||||
autoClean = nodeConfigurationService.get(original.node());
|
||||
assertNotEquals(original.autoClean(), autoClean.autoClean());
|
||||
|
||||
var disabled = new NodeConfiguration(
|
||||
original.node(),
|
||||
"Foo",
|
||||
true,
|
||||
autoClean.autoClean(),
|
||||
autoClean.includeInPrecession(),
|
||||
autoClean.autoAssignDomains(),
|
||||
autoClean.keepWarcs(),
|
||||
autoClean.profile(),
|
||||
!autoClean.disabled()
|
||||
);
|
||||
nodeConfigurationService.save(disabled);
|
||||
disabled = nodeConfigurationService.get(original.node());
|
||||
assertNotEquals(autoClean.disabled(), disabled.disabled());
|
||||
}
|
||||
}
|
@@ -0,0 +1,213 @@
|
||||
|
||||
-- Create metadata tables for domain ping status and security information
|
||||
|
||||
-- These are not ICMP pings, but rather HTTP(S) pings to check the availability and security
|
||||
-- of web servers associated with domains, to assess uptime and changes in security configurations
|
||||
-- indicating ownership changes or security issues.
|
||||
|
||||
-- Note: DOMAIN_ID and NODE_ID are used to identify the domain and the node that performed the ping.
|
||||
-- These are strictly speaking foreign keys to the EC_DOMAIN table, but as it
|
||||
-- is strictly append-only, we do not need to enforce foreign key constraints.
|
||||
|
||||
CREATE TABLE IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION (
|
||||
DOMAIN_ID INT NOT NULL PRIMARY KEY,
|
||||
NODE_ID INT NOT NULL,
|
||||
|
||||
SERVER_AVAILABLE BOOLEAN NOT NULL, -- Indicates if the server is available (true) or not (false)
|
||||
SERVER_IP VARBINARY(16), -- IP address of the server (IPv4 or IPv6)
|
||||
SERVER_IP_ASN INTEGER, -- Autonomous System number
|
||||
|
||||
DATA_HASH BIGINT, -- Hash of the data for integrity checks
|
||||
SECURITY_CONFIG_HASH BIGINT, -- Hash of the security configuration for integrity checks
|
||||
|
||||
HTTP_SCHEMA ENUM('HTTP', 'HTTPS'), -- HTTP or HTTPS protocol used
|
||||
HTTP_ETAG VARCHAR(255), -- ETag of the resource as per HTTP headers
|
||||
HTTP_LAST_MODIFIED VARCHAR(255), -- Last modified date of the resource as per HTTP headers
|
||||
HTTP_STATUS INT, -- HTTP status code (e.g., 200, 404, etc.)
|
||||
HTTP_LOCATION VARCHAR(255), -- If the server redirects, this is the location of the redirect
|
||||
HTTP_RESPONSE_TIME_MS SMALLINT UNSIGNED, -- Response time in milliseconds
|
||||
|
||||
ERROR_CLASSIFICATION ENUM('NONE', 'TIMEOUT', 'SSL_ERROR', 'DNS_ERROR', 'CONNECTION_ERROR', 'HTTP_CLIENT_ERROR', 'HTTP_SERVER_ERROR', 'UNKNOWN'), -- Classification of the error if the server is not available
|
||||
ERROR_MESSAGE VARCHAR(255), -- Error message if the server is not available
|
||||
|
||||
TS_LAST_PING TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, -- Timestamp of the last ping
|
||||
TS_LAST_AVAILABLE TIMESTAMP, -- Timestamp of the last time the server was available
|
||||
TS_LAST_ERROR TIMESTAMP, -- Timestamp of the last error encountered
|
||||
|
||||
NEXT_SCHEDULED_UPDATE TIMESTAMP NOT NULL,
|
||||
BACKOFF_CONSECUTIVE_FAILURES INT NOT NULL DEFAULT 0, -- Number of consecutive failures to ping the server
|
||||
BACKOFF_FETCH_INTERVAL INT NOT NULL DEFAULT 60 -- Interval in seconds for the next scheduled ping
|
||||
) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
|
||||
|
||||
CREATE INDEX IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_AVAILABILITY_INFORMATION (NODE_ID, DOMAIN_ID);
|
||||
CREATE INDEX IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION__NEXT_SCHEDULED_UPDATE_IDX ON DOMAIN_AVAILABILITY_INFORMATION (NODE_ID, NEXT_SCHEDULED_UPDATE);
|
||||
|
||||
|
||||
|
||||
CREATE TABLE IF NOT EXISTS DOMAIN_SECURITY_INFORMATION (
|
||||
DOMAIN_ID INT NOT NULL PRIMARY KEY,
|
||||
NODE_ID INT NOT NULL,
|
||||
|
||||
ASN INTEGER, -- Autonomous System Number (ASN) of the server
|
||||
HTTP_SCHEMA ENUM('HTTP', 'HTTPS'), -- HTTP or HTTPS protocol used
|
||||
HTTP_VERSION VARCHAR(10), -- HTTP version used (e.g., HTTP/1.1, HTTP/2)
|
||||
HTTP_COMPRESSION VARCHAR(50), -- Compression method used (e.g., gzip, deflate, br)
|
||||
HTTP_CACHE_CONTROL TEXT, -- Cache control directives from HTTP headers
|
||||
|
||||
SSL_CERT_NOT_BEFORE TIMESTAMP, -- Valid from date (usually same as issued)
|
||||
SSL_CERT_NOT_AFTER TIMESTAMP, -- Valid until date (usually same as expires)
|
||||
|
||||
SSL_CERT_ISSUER VARCHAR(255), -- CA that issued the cert
|
||||
SSL_CERT_SUBJECT VARCHAR(255), -- Certificate subject/CN
|
||||
|
||||
SSL_CERT_PUBLIC_KEY_HASH BINARY(32), -- SHA-256 hash of the public key
|
||||
SSL_CERT_SERIAL_NUMBER VARCHAR(100), -- Unique cert serial number
|
||||
SSL_CERT_FINGERPRINT_SHA256 BINARY(32), -- SHA-256 fingerprint for exact identification
|
||||
SSL_CERT_SAN TEXT, -- Subject Alternative Names (JSON array)
|
||||
SSL_CERT_WILDCARD BOOLEAN, -- Wildcard certificate (*.example.com)
|
||||
|
||||
SSL_PROTOCOL VARCHAR(20), -- TLS 1.2, TLS 1.3, etc.
|
||||
SSL_CIPHER_SUITE VARCHAR(100), -- e.g., TLS_AES_256_GCM_SHA384
|
||||
SSL_KEY_EXCHANGE VARCHAR(50), -- ECDHE, RSA, etc.
|
||||
SSL_CERTIFICATE_CHAIN_LENGTH TINYINT, -- Number of certs in chain
|
||||
|
||||
SSL_CERTIFICATE_VALID BOOLEAN, -- Valid cert chain
|
||||
|
||||
HEADER_CORS_ALLOW_ORIGIN TEXT, -- Could be *, specific domains, or null
|
||||
HEADER_CORS_ALLOW_CREDENTIALS BOOLEAN, -- Credential handling
|
||||
HEADER_CONTENT_SECURITY_POLICY_HASH INT, -- CSP header, hash of the policy
|
||||
HEADER_STRICT_TRANSPORT_SECURITY VARCHAR(255), -- HSTS header
|
||||
HEADER_REFERRER_POLICY VARCHAR(50), -- Referrer handling
|
||||
HEADER_X_FRAME_OPTIONS VARCHAR(50), -- Clickjacking protection
|
||||
HEADER_X_CONTENT_TYPE_OPTIONS VARCHAR(50), -- MIME sniffing protection
|
||||
HEADER_X_XSS_PROTECTION VARCHAR(50), -- XSS protection header
|
||||
|
||||
HEADER_SERVER VARCHAR(255), -- Server header (e.g., Apache, Nginx, etc.)
|
||||
HEADER_X_POWERED_BY VARCHAR(255), -- X-Powered-By header (if present)
|
||||
|
||||
TS_LAST_UPDATE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP -- Timestamp of the last SSL check
|
||||
) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
|
||||
|
||||
|
||||
CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_INFORMATION__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_SECURITY_INFORMATION (NODE_ID, DOMAIN_ID);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS DOMAIN_SECURITY_EVENTS (
|
||||
CHANGE_ID BIGINT AUTO_INCREMENT PRIMARY KEY, -- Unique identifier for the change
|
||||
DOMAIN_ID INT NOT NULL, -- Domain ID, used as a foreign key to EC_DOMAIN
|
||||
NODE_ID INT NOT NULL,
|
||||
|
||||
TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, -- Timestamp of the change
|
||||
|
||||
CHANGE_ASN BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to ASN (Autonomous System Number)
|
||||
CHANGE_CERTIFICATE_FINGERPRINT BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to SSL certificate fingerprint
|
||||
CHANGE_CERTIFICATE_PROFILE BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to SSL certificate profile (e.g., algorithm, exchange)
|
||||
CHANGE_CERTIFICATE_SAN BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to SSL certificate SAN (Subject Alternative Name)
|
||||
CHANGE_CERTIFICATE_PUBLIC_KEY BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to SSL certificate public key
|
||||
CHANGE_SECURITY_HEADERS BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to security headers
|
||||
CHANGE_IP_ADDRESS BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to IP address
|
||||
CHANGE_SOFTWARE BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to the generator (e.g., web server software)
|
||||
OLD_CERT_TIME_TO_EXPIRY INT, -- Time to expiry of the old certificate in hours, if applicable
|
||||
|
||||
SECURITY_SIGNATURE_BEFORE BLOB NOT NULL, -- Security signature before the change, gzipped json record
|
||||
SECURITY_SIGNATURE_AFTER BLOB NOT NULL -- Security signature after the change, gzipped json record
|
||||
) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
|
||||
|
||||
CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_EVENTS__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_SECURITY_EVENTS (NODE_ID, DOMAIN_ID);
|
||||
CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_EVENTS__TS_CHANGE_IDX ON DOMAIN_SECURITY_EVENTS (TS_CHANGE);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS DOMAIN_AVAILABILITY_EVENTS (
|
||||
DOMAIN_ID INT NOT NULL,
|
||||
NODE_ID INT NOT NULL,
|
||||
|
||||
AVAILABLE BOOLEAN NOT NULL, -- True if the service is available, false if it is not
|
||||
OUTAGE_TYPE ENUM('NONE', 'TIMEOUT', 'SSL_ERROR', 'DNS_ERROR', 'CONNECTION_ERROR', 'HTTP_CLIENT_ERROR', 'HTTP_SERVER_ERROR', 'UNKNOWN') NOT NULL,
|
||||
HTTP_STATUS_CODE INT, -- HTTP status code if available (e.g., 200, 404, etc.)
|
||||
ERROR_MESSAGE VARCHAR(255), -- Specific error details
|
||||
|
||||
TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, -- Timestamp of the last update
|
||||
|
||||
AVAILABILITY_RECORD_ID BIGINT AUTO_INCREMENT,
|
||||
P_KEY_MONTH TINYINT NOT NULL DEFAULT MONTH(TS_CHANGE), -- Month of the change for partitioning
|
||||
PRIMARY KEY (AVAILABILITY_RECORD_ID, P_KEY_MONTH)
|
||||
)
|
||||
CHARACTER SET utf8mb4 COLLATE utf8mb4_bin
|
||||
PARTITION BY RANGE (P_KEY_MONTH) (
|
||||
PARTITION p0 VALUES LESS THAN (1), -- January
|
||||
PARTITION p1 VALUES LESS THAN (2), -- February
|
||||
PARTITION p2 VALUES LESS THAN (3), -- March
|
||||
PARTITION p3 VALUES LESS THAN (4), -- April
|
||||
PARTITION p4 VALUES LESS THAN (5), -- May
|
||||
PARTITION p5 VALUES LESS THAN (6), -- June
|
||||
PARTITION p6 VALUES LESS THAN (7), -- July
|
||||
PARTITION p7 VALUES LESS THAN (8), -- August
|
||||
PARTITION p8 VALUES LESS THAN (9), -- September
|
||||
PARTITION p9 VALUES LESS THAN (10), -- October
|
||||
PARTITION p10 VALUES LESS THAN (11), -- November
|
||||
PARTITION p11 VALUES LESS THAN (12) -- December
|
||||
);
|
||||
|
||||
CREATE INDEX DOMAIN_AVAILABILITY_EVENTS__DOMAIN_ID_TS_IDX ON DOMAIN_AVAILABILITY_EVENTS (DOMAIN_ID, TS_CHANGE);
|
||||
CREATE INDEX DOMAIN_AVAILABILITY_EVENTS__TS_CHANGE_IDX ON DOMAIN_AVAILABILITY_EVENTS (TS_CHANGE);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS DOMAIN_DNS_INFORMATION (
|
||||
DNS_ROOT_DOMAIN_ID INT AUTO_INCREMENT PRIMARY KEY,
|
||||
ROOT_DOMAIN_NAME VARCHAR(255) NOT NULL UNIQUE,
|
||||
NODE_AFFINITY INT NOT NULL, -- Node ID that performs the DNS check, assign randomly across nodes
|
||||
|
||||
DNS_A_RECORDS TEXT, -- JSON array of IPv4 addresses
|
||||
DNS_AAAA_RECORDS TEXT, -- JSON array of IPv6 addresses
|
||||
DNS_CNAME_RECORD VARCHAR(255), -- Canonical name (if applicable)
|
||||
DNS_MX_RECORDS TEXT, -- JSON array of mail exchange records
|
||||
DNS_CAA_RECORDS TEXT, -- Certificate Authority Authorization
|
||||
DNS_TXT_RECORDS TEXT, -- TXT records (SPF, DKIM, verification, etc.)
|
||||
DNS_NS_RECORDS TEXT, -- Name servers (JSON array)
|
||||
DNS_SOA_RECORD TEXT, -- Start of Authority (JSON object)
|
||||
|
||||
TS_LAST_DNS_CHECK TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
|
||||
TS_NEXT_DNS_CHECK TIMESTAMP NOT NULL,
|
||||
DNS_CHECK_PRIORITY TINYINT DEFAULT 0 -- Priority of the DNS check, in case we want to schedule a refresh sooner
|
||||
) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
|
||||
|
||||
CREATE INDEX DOMAIN_DNS_INFORMATION__PRIORITY_NEXT_CHECK_IDX ON DOMAIN_DNS_INFORMATION (NODE_AFFINITY, DNS_CHECK_PRIORITY DESC, TS_NEXT_DNS_CHECK);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS DOMAIN_DNS_EVENTS (
|
||||
DNS_ROOT_DOMAIN_ID INT NOT NULL,
|
||||
NODE_ID INT NOT NULL,
|
||||
|
||||
TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
|
||||
-- DNS change type flags
|
||||
CHANGE_A_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- IPv4 address changes
|
||||
CHANGE_AAAA_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- IPv6 address changes
|
||||
CHANGE_CNAME BOOLEAN NOT NULL DEFAULT FALSE, -- CNAME changes
|
||||
CHANGE_MX_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- Mail server changes
|
||||
CHANGE_CAA_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- Certificate authority changes
|
||||
CHANGE_TXT_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- TXT record changes (SPF, DKIM, etc.)
|
||||
CHANGE_NS_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- Name server changes (big red flag!)
|
||||
CHANGE_SOA_RECORD BOOLEAN NOT NULL DEFAULT FALSE, -- Start of Authority changes
|
||||
|
||||
DNS_SIGNATURE_BEFORE BLOB NOT NULL, -- Compressed JSON snapshot of DNS records before change
|
||||
DNS_SIGNATURE_AFTER BLOB NOT NULL, -- Compressed JSON snapshot of DNS records after change
|
||||
|
||||
DNS_EVENT_ID BIGINT AUTO_INCREMENT,
|
||||
P_KEY_MONTH TINYINT NOT NULL DEFAULT MONTH(TS_CHANGE), -- Month of the change for partitioning
|
||||
PRIMARY KEY (DNS_EVENT_ID, P_KEY_MONTH)
|
||||
)
|
||||
CHARACTER SET utf8mb4 COLLATE utf8mb4_bin
|
||||
PARTITION BY RANGE (P_KEY_MONTH) (
|
||||
PARTITION p0 VALUES LESS THAN (1), -- January
|
||||
PARTITION p1 VALUES LESS THAN (2), -- February
|
||||
PARTITION p2 VALUES LESS THAN (3), -- March
|
||||
PARTITION p3 VALUES LESS THAN (4), -- April
|
||||
PARTITION p4 VALUES LESS THAN (5), -- May
|
||||
PARTITION p5 VALUES LESS THAN (6), -- June
|
||||
PARTITION p6 VALUES LESS THAN (7), -- July
|
||||
PARTITION p7 VALUES LESS THAN (8), -- August
|
||||
PARTITION p8 VALUES LESS THAN (9), -- September
|
||||
PARTITION p9 VALUES LESS THAN (10), -- October
|
||||
PARTITION p10 VALUES LESS THAN (11), -- November
|
||||
PARTITION p11 VALUES LESS THAN (12) -- December
|
||||
);
|
||||
|
||||
CREATE INDEX DOMAIN_DNS_EVENTS__DNS_ROOT_DOMAIN_ID_TS_IDX ON DOMAIN_DNS_EVENTS (DNS_ROOT_DOMAIN_ID, TS_CHANGE);
|
||||
CREATE INDEX DOMAIN_DNS_EVENTS__TS_CHANGE_IDX ON DOMAIN_DNS_EVENTS (TS_CHANGE);
|
@@ -0,0 +1,6 @@
|
||||
-- Add additional summary columns to DOMAIN_SECURITY_EVENTS table
|
||||
-- to make it easier to make sense of certificate changes
|
||||
|
||||
ALTER TABLE DOMAIN_SECURITY_EVENTS ADD COLUMN CHANGE_CERTIFICATE_SERIAL_NUMBER BOOLEAN NOT NULL DEFAULT FALSE;
|
||||
ALTER TABLE DOMAIN_SECURITY_EVENTS ADD COLUMN CHANGE_CERTIFICATE_ISSUER BOOLEAN NOT NULL DEFAULT FALSE;
|
||||
OPTIMIZE TABLE DOMAIN_SECURITY_EVENTS;
|
@@ -0,0 +1,7 @@
|
||||
-- Add additional summary columns to DOMAIN_SECURITY_INFORMATION table
|
||||
-- to make it easier to get more information about the SSL certificate's validity
|
||||
|
||||
ALTER TABLE DOMAIN_SECURITY_INFORMATION ADD COLUMN SSL_CHAIN_VALID BOOLEAN DEFAULT NULL;
|
||||
ALTER TABLE DOMAIN_SECURITY_INFORMATION ADD COLUMN SSL_HOST_VALID BOOLEAN DEFAULT NULL;
|
||||
ALTER TABLE DOMAIN_SECURITY_INFORMATION ADD COLUMN SSL_DATE_VALID BOOLEAN DEFAULT NULL;
|
||||
OPTIMIZE TABLE DOMAIN_SECURITY_INFORMATION;
|
@@ -0,0 +1,5 @@
|
||||
-- Add additional summary columns to DOMAIN_SECURITY_EVENTS table
|
||||
-- to make it easier to make sense of certificate changes
|
||||
|
||||
ALTER TABLE DOMAIN_SECURITY_EVENTS ADD COLUMN CHANGE_SCHEMA ENUM('NONE', 'HTTP_TO_HTTPS', 'HTTPS_TO_HTTP', 'UNKNOWN') NOT NULL DEFAULT 'UNKNOWN';
|
||||
OPTIMIZE TABLE DOMAIN_SECURITY_EVENTS;
|
@@ -0,0 +1,12 @@
|
||||
-- Table holding domains to be processed by the NDP in order to figure out whether to add them to
|
||||
-- be crawled.
|
||||
|
||||
CREATE TABLE IF NOT EXISTS NDP_NEW_DOMAINS(
|
||||
DOMAIN_ID INT NOT NULL PRIMARY KEY,
|
||||
STATE ENUM ('NEW', 'ACCEPTED', 'REJECTED') NOT NULL DEFAULT 'NEW',
|
||||
PRIORITY INT NOT NULL DEFAULT 0,
|
||||
TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
|
||||
CHECK_COUNT INT NOT NULL DEFAULT 0
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS NDP_NEW_DOMAINS__STATE_PRIORITY ON NDP_NEW_DOMAINS (STATE, PRIORITY DESC);
|
@@ -0,0 +1,3 @@
|
||||
-- Migration script to add AUTO_ASSIGN_DOMAINS column to NODE_CONFIGURATION table
|
||||
|
||||
ALTER TABLE NODE_CONFIGURATION ADD COLUMN AUTO_ASSIGN_DOMAINS BOOLEAN NOT NULL DEFAULT TRUE;
|
@@ -6,11 +6,20 @@ import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
import java.time.Instant;
|
||||
|
||||
public class GsonFactory {
|
||||
public static Gson get() {
|
||||
return new GsonBuilder()
|
||||
.registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
|
||||
.registerTypeAdapter(Instant.class, (JsonSerializer<Instant>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toEpochMilli()))
|
||||
.registerTypeAdapter(Instant.class, (JsonDeserializer<Instant>) (json, typeOfT, context) -> {
|
||||
if (json.isJsonPrimitive() && json.getAsJsonPrimitive().isNumber()) {
|
||||
return Instant.ofEpochMilli(json.getAsLong());
|
||||
} else {
|
||||
throw new JsonParseException("Expected a number for Instant");
|
||||
}
|
||||
})
|
||||
.registerTypeAdapter(EdgeUrl.class, (JsonSerializer<EdgeUrl>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
|
||||
.registerTypeAdapter(EdgeDomain.class, (JsonSerializer<EdgeDomain>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
|
||||
.registerTypeAdapter(EdgeUrl.class, (JsonDeserializer<EdgeUrl>) (json, typeOfT, context) -> {
|
||||
|
@@ -0,0 +1,59 @@
|
||||
package nu.marginalia.process.control;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.process.ProcessConfiguration;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.Objects;
|
||||
import java.util.UUID;
|
||||
|
||||
@Singleton
|
||||
public class ProcessEventLog {
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(ProcessEventLog.class);
|
||||
|
||||
private final String serviceName;
|
||||
private final UUID instanceUuid;
|
||||
private final String serviceBase;
|
||||
|
||||
@Inject
|
||||
public ProcessEventLog(HikariDataSource dataSource, ProcessConfiguration configuration) {
|
||||
this.dataSource = dataSource;
|
||||
|
||||
this.serviceName = configuration.processName() + ":" + configuration.node();
|
||||
this.instanceUuid = configuration.instanceUuid();
|
||||
this.serviceBase = configuration.processName();
|
||||
|
||||
logger.info("Starting service {} instance {}", serviceName, instanceUuid);
|
||||
|
||||
logEvent("PCS-START", serviceName);
|
||||
}
|
||||
|
||||
public void logEvent(Class<?> type, String message) {
|
||||
logEvent(type.getSimpleName(), message);
|
||||
}
|
||||
public void logEvent(String type, String message) {
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
INSERT INTO SERVICE_EVENTLOG(SERVICE_NAME, SERVICE_BASE, INSTANCE, EVENT_TYPE, EVENT_MESSAGE)
|
||||
VALUES (?, ?, ?, ?, ?)
|
||||
""")) {
|
||||
stmt.setString(1, serviceName);
|
||||
stmt.setString(2, serviceBase);
|
||||
stmt.setString(3, instanceUuid.toString());
|
||||
stmt.setString(4, type);
|
||||
stmt.setString(5, Objects.requireNonNull(message, ""));
|
||||
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to log event {}:{}", type, message);
|
||||
}
|
||||
}
|
||||
}
|
@@ -1,17 +1,21 @@
|
||||
package nu.marginalia.service.discovery;
|
||||
|
||||
import nu.marginalia.service.discovery.monitor.*;
|
||||
import com.google.inject.ImplementedBy;
|
||||
import nu.marginalia.service.discovery.monitor.ServiceChangeMonitor;
|
||||
import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
|
||||
import nu.marginalia.service.discovery.property.ServiceEndpoint;
|
||||
import static nu.marginalia.service.discovery.property.ServiceEndpoint.*;
|
||||
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import org.apache.curator.framework.recipes.locks.InterProcessSemaphoreV2;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
import static nu.marginalia.service.discovery.property.ServiceEndpoint.InstanceAddress;
|
||||
|
||||
/** A service registry that allows services to register themselves and
|
||||
* be discovered by other services on the network.
|
||||
*/
|
||||
@ImplementedBy(ZkServiceRegistry.class)
|
||||
public interface ServiceRegistryIf {
|
||||
/**
|
||||
* Register a service with the registry.
|
||||
@@ -57,4 +61,9 @@ public interface ServiceRegistryIf {
|
||||
* </ul>
|
||||
* */
|
||||
void registerMonitor(ServiceMonitorIf monitor) throws Exception;
|
||||
|
||||
void registerProcess(String processName, int nodeId);
|
||||
void deregisterProcess(String processName, int nodeId);
|
||||
|
||||
InterProcessSemaphoreV2 getSemaphore(String name, int permits) throws Exception;
|
||||
}
|
||||
|
@@ -6,6 +6,7 @@ import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
|
||||
import nu.marginalia.service.discovery.property.ServiceEndpoint;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import org.apache.curator.framework.CuratorFramework;
|
||||
import org.apache.curator.framework.recipes.locks.InterProcessSemaphoreV2;
|
||||
import org.apache.curator.utils.ZKPaths;
|
||||
import org.apache.zookeeper.CreateMode;
|
||||
import org.apache.zookeeper.Watcher;
|
||||
@@ -256,6 +257,42 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
|
||||
.forPath("/running-instances");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void registerProcess(String processName, int nodeId) {
|
||||
String path = "/process-locks/" + processName + "/" + nodeId;
|
||||
try {
|
||||
curatorFramework.create()
|
||||
.creatingParentsIfNeeded()
|
||||
.withMode(CreateMode.EPHEMERAL)
|
||||
.forPath(path);
|
||||
livenessPaths.add(path);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Failed to register process {} on node {}", processName, nodeId, ex);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void deregisterProcess(String processName, int nodeId) {
|
||||
String path = "/process-locks/" + processName + "/" + nodeId;
|
||||
try {
|
||||
curatorFramework.delete().forPath(path);
|
||||
livenessPaths.remove(path);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Failed to deregister process {} on node {}", processName, nodeId, ex);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public InterProcessSemaphoreV2 getSemaphore(String name, int permits) {
|
||||
if (stopped)
|
||||
throw new IllegalStateException("Service registry is stopped, cannot get semaphore " + name);
|
||||
|
||||
String path = "/semaphores/" + name;
|
||||
return new InterProcessSemaphoreV2(curatorFramework, path, permits);
|
||||
}
|
||||
|
||||
/* Exposed for tests */
|
||||
public synchronized void shutDown() {
|
||||
if (stopped)
|
||||
|
@@ -9,6 +9,7 @@ import nu.marginalia.executor.storage.FileStorageFile;
|
||||
import nu.marginalia.executor.upload.UploadDirContents;
|
||||
import nu.marginalia.executor.upload.UploadDirItem;
|
||||
import nu.marginalia.functions.execution.api.*;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.service.ServiceId;
|
||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
|
||||
@@ -25,27 +26,37 @@ import java.net.URISyntaxException;
|
||||
import java.net.URL;
|
||||
import java.net.URLEncoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.Duration;
|
||||
import java.util.List;
|
||||
|
||||
import static nu.marginalia.functions.execution.api.ExecutorApiGrpc.ExecutorApiBlockingStub;
|
||||
|
||||
@Singleton
|
||||
public class ExecutorClient {
|
||||
private final MqPersistence persistence;
|
||||
private final GrpcMultiNodeChannelPool<ExecutorApiBlockingStub> channelPool;
|
||||
private static final Logger logger = LoggerFactory.getLogger(ExecutorClient.class);
|
||||
private final ServiceRegistryIf registry;
|
||||
|
||||
@Inject
|
||||
public ExecutorClient(ServiceRegistryIf registry,
|
||||
MqPersistence persistence,
|
||||
GrpcChannelPoolFactory grpcChannelPoolFactory)
|
||||
{
|
||||
this.registry = registry;
|
||||
this.persistence = persistence;
|
||||
this.channelPool = grpcChannelPoolFactory
|
||||
.createMulti(
|
||||
ServiceKey.forGrpcApi(ExecutorApiGrpc.class, ServicePartition.multi()),
|
||||
ExecutorApiGrpc::newBlockingStub);
|
||||
}
|
||||
|
||||
private long createTrackingTokenMsg(String task, int node, Duration ttl) throws Exception {
|
||||
return persistence.sendNewMessage("task-tracking[" + node + "]", "export-client", null, task, "", ttl);
|
||||
}
|
||||
|
||||
|
||||
|
||||
public void startFsm(int node, String actorName) {
|
||||
channelPool.call(ExecutorApiBlockingStub::startFsm)
|
||||
.forNode(node)
|
||||
@@ -96,6 +107,16 @@ public class ExecutorClient {
|
||||
.build());
|
||||
}
|
||||
|
||||
public long updateNsfwFilters() throws Exception {
|
||||
long msgId = createTrackingTokenMsg("nsfw-filters", 1, Duration.ofHours(6));
|
||||
|
||||
channelPool.call(ExecutorApiBlockingStub::updateNsfwFilters)
|
||||
.forNode(1)
|
||||
.run(RpcUpdateNsfwFilters.newBuilder().setMsgId(msgId).build());
|
||||
|
||||
return msgId;
|
||||
}
|
||||
|
||||
public ActorRunStates getActorStates(int node) {
|
||||
try {
|
||||
var rs = channelPool.call(ExecutorApiBlockingStub::getActorStates)
|
||||
|
@@ -18,6 +18,8 @@ service ExecutorApi {
|
||||
rpc calculateAdjacencies(Empty) returns (Empty) {}
|
||||
rpc restoreBackup(RpcFileStorageId) returns (Empty) {}
|
||||
|
||||
rpc updateNsfwFilters(RpcUpdateNsfwFilters) returns (Empty) {}
|
||||
|
||||
rpc restartExecutorService(Empty) returns (Empty) {}
|
||||
}
|
||||
|
||||
@@ -66,6 +68,9 @@ message RpcExportRequest {
|
||||
int64 fileStorageId = 1;
|
||||
int64 msgId = 2;
|
||||
}
|
||||
message RpcUpdateNsfwFilters {
|
||||
int64 msgId = 1;
|
||||
}
|
||||
message RpcFileStorageIdWithDomainName {
|
||||
int64 fileStorageId = 1;
|
||||
string targetDomainName = 2;
|
||||
|
@@ -19,6 +19,8 @@ dependencies {
|
||||
implementation project(':code:processes:crawling-process')
|
||||
implementation project(':code:processes:live-crawling-process')
|
||||
implementation project(':code:processes:loading-process')
|
||||
implementation project(':code:processes:ping-process')
|
||||
implementation project(':code:processes:new-domain-process')
|
||||
implementation project(':code:processes:converting-process')
|
||||
implementation project(':code:processes:index-constructor-process')
|
||||
|
||||
@@ -40,7 +42,6 @@ dependencies {
|
||||
implementation project(':code:functions:nsfw-domain-filter')
|
||||
implementation project(':code:execution:api')
|
||||
|
||||
implementation project(':code:processes:crawling-process:model')
|
||||
implementation project(':code:processes:crawling-process:model')
|
||||
implementation project(':code:processes:crawling-process:ft-link-parser')
|
||||
implementation project(':code:index:index-journal')
|
||||
|
@@ -6,13 +6,15 @@ import java.util.Set;
|
||||
|
||||
public enum ExecutorActor {
|
||||
PREC_EXPORT_ALL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
SYNC_NSFW_LISTS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
UPDATE_NSFW_LISTS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD, NodeProfile.REALTIME),
|
||||
|
||||
CRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
RECRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
RECRAWL_SINGLE_DOMAIN(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
PROC_CRAWLER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
PROC_PING_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.REALTIME),
|
||||
PROC_EXPORT_TASKS_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
PROC_NDP_SPAWNER(NodeProfile.MIXED, NodeProfile.REALTIME),
|
||||
ADJACENCY_CALCULATION(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
EXPORT_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
EXPORT_SEGMENTATION_MODEL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
|
@@ -49,6 +49,8 @@ public class ExecutorActorControlService {
|
||||
RecrawlSingleDomainActor recrawlSingleDomainActor,
|
||||
RestoreBackupActor restoreBackupActor,
|
||||
ConverterMonitorActor converterMonitorFSM,
|
||||
NdpMonitorActor ndpMonitorActor,
|
||||
PingMonitorActor pingMonitorActor,
|
||||
CrawlerMonitorActor crawlerMonitorActor,
|
||||
LiveCrawlerMonitorActor liveCrawlerMonitorActor,
|
||||
LoaderMonitorActor loaderMonitor,
|
||||
@@ -89,9 +91,10 @@ public class ExecutorActorControlService {
|
||||
register(ExecutorActor.PROC_CONVERTER_SPAWNER, converterMonitorFSM);
|
||||
register(ExecutorActor.PROC_LOADER_SPAWNER, loaderMonitor);
|
||||
register(ExecutorActor.PROC_CRAWLER_SPAWNER, crawlerMonitorActor);
|
||||
register(ExecutorActor.PROC_PING_SPAWNER, pingMonitorActor);
|
||||
register(ExecutorActor.PROC_LIVE_CRAWL_SPAWNER, liveCrawlerMonitorActor);
|
||||
register(ExecutorActor.PROC_EXPORT_TASKS_SPAWNER, exportTasksMonitorActor);
|
||||
|
||||
register(ExecutorActor.PROC_NDP_SPAWNER, ndpMonitorActor);
|
||||
register(ExecutorActor.MONITOR_PROCESS_LIVENESS, processMonitorFSM);
|
||||
register(ExecutorActor.MONITOR_FILE_STORAGE, fileStorageMonitorActor);
|
||||
|
||||
@@ -110,7 +113,7 @@ public class ExecutorActorControlService {
|
||||
register(ExecutorActor.UPDATE_RSS, updateRssActor);
|
||||
|
||||
register(ExecutorActor.MIGRATE_CRAWL_DATA, migrateCrawlDataActor);
|
||||
register(ExecutorActor.SYNC_NSFW_LISTS, updateNsfwFiltersActor);
|
||||
register(ExecutorActor.UPDATE_NSFW_LISTS, updateNsfwFiltersActor);
|
||||
|
||||
if (serviceConfiguration.node() == 1) {
|
||||
register(ExecutorActor.PREC_EXPORT_ALL, exportAllPrecessionActor);
|
||||
|
@@ -0,0 +1,29 @@
|
||||
package nu.marginalia.actor.proc;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@Singleton
|
||||
public class NdpMonitorActor extends AbstractProcessSpawnerActor {
|
||||
|
||||
@Inject
|
||||
public NdpMonitorActor(Gson gson,
|
||||
ServiceConfiguration configuration,
|
||||
MqPersistence persistence,
|
||||
ProcessService processService) {
|
||||
super(gson,
|
||||
configuration,
|
||||
persistence,
|
||||
processService,
|
||||
ProcessInboxNames.NDP_INBOX,
|
||||
ProcessService.ProcessId.NDP);
|
||||
}
|
||||
|
||||
|
||||
}
|
@@ -0,0 +1,181 @@
|
||||
package nu.marginalia.actor.proc;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.actor.state.Resume;
|
||||
import nu.marginalia.actor.state.Terminal;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.persistence.MqMessageHandlerRegistry;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.mqapi.ping.PingRequest;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
|
||||
// Unlike other monitor actors, the ping monitor will not merely wait for a request
|
||||
// to be sent, but send one itself, hence we can't extend AbstractProcessSpawnerActor
|
||||
// but have to reimplement a lot of the same logic ourselves.
|
||||
@Singleton
|
||||
public class PingMonitorActor extends RecordActorPrototype {
|
||||
|
||||
private final MqPersistence persistence;
|
||||
private final ProcessService processService;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
public static final int MAX_ATTEMPTS = 3;
|
||||
private final String inboxName;
|
||||
private final ProcessService.ProcessId processId;
|
||||
private final ExecutorService executorService = Executors.newSingleThreadExecutor();
|
||||
private final int node;
|
||||
private final Gson gson;
|
||||
|
||||
public record Initial() implements ActorStep {}
|
||||
@Resume(behavior = ActorResumeBehavior.RETRY)
|
||||
public record Monitor(int errorAttempts) implements ActorStep {}
|
||||
@Resume(behavior = ActorResumeBehavior.RESTART)
|
||||
public record Run(int attempts) implements ActorStep {}
|
||||
@Terminal
|
||||
public record Aborted() implements ActorStep {}
|
||||
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch (self) {
|
||||
case Initial i -> {
|
||||
PingRequest request = new PingRequest();
|
||||
persistence.sendNewMessage(inboxName, null, null,
|
||||
"PingRequest",
|
||||
gson.toJson(request),
|
||||
null);
|
||||
|
||||
yield new Monitor(0);
|
||||
}
|
||||
case Monitor(int errorAttempts) -> {
|
||||
for (;;) {
|
||||
var messages = persistence.eavesdrop(inboxName, 1);
|
||||
|
||||
if (messages.isEmpty() && !processService.isRunning(processId)) {
|
||||
synchronized (processId) {
|
||||
processId.wait(5000);
|
||||
}
|
||||
|
||||
if (errorAttempts > 0) { // Reset the error counter if there is silence in the inbox
|
||||
yield new Monitor(0);
|
||||
}
|
||||
// else continue
|
||||
} else {
|
||||
// Special: Associate this thread with the message so that we can get tracking
|
||||
MqMessageHandlerRegistry.register(messages.getFirst().msgId());
|
||||
|
||||
yield new Run(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
case Run(int attempts) -> {
|
||||
try {
|
||||
long startTime = System.currentTimeMillis();
|
||||
var exec = new TaskExecution();
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
if (exec.isError()) {
|
||||
if (attempts < MAX_ATTEMPTS)
|
||||
yield new Run(attempts + 1);
|
||||
else
|
||||
yield new Error();
|
||||
}
|
||||
else if (endTime - startTime < TimeUnit.SECONDS.toMillis(1)) {
|
||||
// To avoid boot loops, we transition to error if the process
|
||||
// didn't run for longer than 1 seconds. This might happen if
|
||||
// the process crashes before it can reach the heartbeat and inbox
|
||||
// stages of execution. In this case it would not report having acted
|
||||
// on its message, and the process would be restarted forever without
|
||||
// the attempts counter incrementing.
|
||||
yield new Error("Process terminated within 1 seconds of starting");
|
||||
}
|
||||
}
|
||||
catch (InterruptedException ex) {
|
||||
// We get this exception when the process is cancelled by the user
|
||||
|
||||
processService.kill(processId);
|
||||
setCurrentMessageToDead();
|
||||
|
||||
yield new Aborted();
|
||||
}
|
||||
|
||||
yield new Monitor(attempts);
|
||||
}
|
||||
default -> new Error();
|
||||
};
|
||||
}
|
||||
|
||||
public String describe() {
|
||||
return "Spawns a(n) " + processId + " process and monitors its inbox for messages";
|
||||
}
|
||||
|
||||
@Inject
|
||||
public PingMonitorActor(Gson gson,
|
||||
ServiceConfiguration configuration,
|
||||
MqPersistence persistence,
|
||||
ProcessService processService) throws SQLException {
|
||||
super(gson);
|
||||
this.gson = gson;
|
||||
this.node = configuration.node();
|
||||
this.persistence = persistence;
|
||||
this.processService = processService;
|
||||
this.inboxName = ProcessInboxNames.PING_INBOX + ":" + node;
|
||||
this.processId = ProcessService.ProcessId.PING;
|
||||
}
|
||||
|
||||
/** Sets the message to dead in the database to avoid
|
||||
* the service respawning on the same task when we
|
||||
* re-enable this actor */
|
||||
private void setCurrentMessageToDead() {
|
||||
try {
|
||||
var messages = persistence.eavesdrop(inboxName, 1);
|
||||
|
||||
if (messages.isEmpty()) // Possibly a race condition where the task is already finished
|
||||
return;
|
||||
|
||||
var theMessage = messages.iterator().next();
|
||||
persistence.updateMessageState(theMessage.msgId(), MqMessageState.DEAD);
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Tried but failed to set the message for " + processId + " to dead", ex);
|
||||
}
|
||||
}
|
||||
|
||||
/** Encapsulates the execution of the process in a separate thread so that
|
||||
* we can interrupt the thread if the process is cancelled */
|
||||
private class TaskExecution {
|
||||
private final AtomicBoolean error = new AtomicBoolean(false);
|
||||
public TaskExecution() throws ExecutionException, InterruptedException {
|
||||
// Run this call in a separate thread so that this thread can be interrupted waiting for it
|
||||
executorService.submit(() -> {
|
||||
try {
|
||||
processService.trigger(processId);
|
||||
} catch (Exception e) {
|
||||
logger.warn("Error in triggering process", e);
|
||||
error.set(true);
|
||||
}
|
||||
}).get(); // Wait for the process to start
|
||||
}
|
||||
|
||||
public boolean isError() {
|
||||
return error.get();
|
||||
}
|
||||
}
|
||||
}
|
@@ -44,7 +44,6 @@ public class LiveCrawlActor extends RecordActorPrototype {
|
||||
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
logger.info("{}", self);
|
||||
return switch (self) {
|
||||
case Initial() -> {
|
||||
yield new Monitor("-");
|
||||
|
@@ -5,6 +5,8 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.nsfw.NsfwDomainFilter;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@@ -12,23 +14,26 @@ import nu.marginalia.service.module.ServiceConfiguration;
|
||||
public class UpdateNsfwFiltersActor extends RecordActorPrototype {
|
||||
private final ServiceConfiguration serviceConfiguration;
|
||||
private final NsfwDomainFilter nsfwDomainFilter;
|
||||
private final MqPersistence persistence;
|
||||
|
||||
public record Initial() implements ActorStep {}
|
||||
public record Run() implements ActorStep {}
|
||||
public record Initial(long respondMsgId) implements ActorStep {}
|
||||
public record Run(long respondMsgId) implements ActorStep {}
|
||||
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch(self) {
|
||||
case Initial() -> {
|
||||
case Initial(long respondMsgId) -> {
|
||||
if (serviceConfiguration.node() != 1) {
|
||||
persistence.updateMessageState(respondMsgId, MqMessageState.ERR);
|
||||
yield new Error("This actor can only run on node 1");
|
||||
}
|
||||
else {
|
||||
yield new Run();
|
||||
yield new Run(respondMsgId);
|
||||
}
|
||||
}
|
||||
case Run() -> {
|
||||
case Run(long respondMsgId) -> {
|
||||
nsfwDomainFilter.fetchLists();
|
||||
persistence.updateMessageState(respondMsgId, MqMessageState.OK);
|
||||
yield new End();
|
||||
}
|
||||
default -> new Error();
|
||||
@@ -43,11 +48,13 @@ public class UpdateNsfwFiltersActor extends RecordActorPrototype {
|
||||
@Inject
|
||||
public UpdateNsfwFiltersActor(Gson gson,
|
||||
ServiceConfiguration serviceConfiguration,
|
||||
NsfwDomainFilter nsfwDomainFilter)
|
||||
NsfwDomainFilter nsfwDomainFilter,
|
||||
MqPersistence persistence)
|
||||
{
|
||||
super(gson);
|
||||
this.serviceConfiguration = serviceConfiguration;
|
||||
this.nsfwDomainFilter = nsfwDomainFilter;
|
||||
this.persistence = persistence;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -10,6 +10,7 @@ import nu.marginalia.actor.state.ActorStateInstance;
|
||||
import nu.marginalia.actor.task.DownloadSampleActor;
|
||||
import nu.marginalia.actor.task.RestoreBackupActor;
|
||||
import nu.marginalia.actor.task.TriggerAdjacencyCalculationActor;
|
||||
import nu.marginalia.actor.task.UpdateNsfwFiltersActor;
|
||||
import nu.marginalia.functions.execution.api.*;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.service.server.DiscoverableService;
|
||||
@@ -263,4 +264,19 @@ public class ExecutorGrpcService
|
||||
System.exit(0);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateNsfwFilters(RpcUpdateNsfwFilters request, StreamObserver<Empty> responseObserver) {
|
||||
logger.info("Got request {}", request);
|
||||
try {
|
||||
actorControlService.startFrom(ExecutorActor.UPDATE_NSFW_LISTS,
|
||||
new UpdateNsfwFiltersActor.Initial(request.getMsgId()));
|
||||
|
||||
responseObserver.onNext(Empty.getDefaultInstance());
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Failed to update nsfw filters", e);
|
||||
responseObserver.onError(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -8,6 +8,8 @@ import nu.marginalia.crawl.CrawlerMain;
|
||||
import nu.marginalia.index.IndexConstructorMain;
|
||||
import nu.marginalia.livecrawler.LiveCrawlerMain;
|
||||
import nu.marginalia.loading.LoaderMain;
|
||||
import nu.marginalia.ndp.NdpMain;
|
||||
import nu.marginalia.ping.PingMain;
|
||||
import nu.marginalia.service.control.ServiceEventLog;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
import nu.marginalia.task.ExportTasksMain;
|
||||
@@ -41,6 +43,7 @@ public class ProcessService {
|
||||
return switch (id) {
|
||||
case "converter" -> ProcessId.CONVERTER;
|
||||
case "crawler" -> ProcessId.CRAWLER;
|
||||
case "ping" -> ProcessId.PING;
|
||||
case "loader" -> ProcessId.LOADER;
|
||||
case "export-tasks" -> ProcessId.EXPORT_TASKS;
|
||||
case "index-constructor" -> ProcessId.INDEX_CONSTRUCTOR;
|
||||
@@ -50,10 +53,12 @@ public class ProcessService {
|
||||
|
||||
public enum ProcessId {
|
||||
CRAWLER(CrawlerMain.class),
|
||||
PING(PingMain.class),
|
||||
LIVE_CRAWLER(LiveCrawlerMain.class),
|
||||
CONVERTER(ConverterMain.class),
|
||||
LOADER(LoaderMain.class),
|
||||
INDEX_CONSTRUCTOR(IndexConstructorMain.class),
|
||||
NDP(NdpMain.class),
|
||||
EXPORT_TASKS(ExportTasksMain.class),
|
||||
;
|
||||
|
||||
@@ -68,6 +73,8 @@ public class ProcessService {
|
||||
case LIVE_CRAWLER -> "LIVE_CRAWLER_PROCESS_OPTS";
|
||||
case CONVERTER -> "CONVERTER_PROCESS_OPTS";
|
||||
case LOADER -> "LOADER_PROCESS_OPTS";
|
||||
case PING -> "PING_PROCESS_OPTS";
|
||||
case NDP -> "NDP_PROCESS_OPTS";
|
||||
case INDEX_CONSTRUCTOR -> "INDEX_CONSTRUCTION_PROCESS_OPTS";
|
||||
case EXPORT_TASKS -> "EXPORT_TASKS_PROCESS_OPTS";
|
||||
};
|
||||
|
@@ -27,10 +27,12 @@ public class DbBrowseDomainsRandom {
|
||||
public List<BrowseResult> getRandomDomains(int count, DomainBlacklist blacklist, int set) {
|
||||
|
||||
final String q = """
|
||||
SELECT DOMAIN_ID, DOMAIN_NAME, INDEXED
|
||||
SELECT EC_RANDOM_DOMAINS.DOMAIN_ID, DOMAIN_NAME, INDEXED
|
||||
FROM EC_RANDOM_DOMAINS
|
||||
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID
|
||||
LEFT JOIN DOMAIN_AVAILABILITY_INFORMATION DAI ON DAI.DOMAIN_ID=EC_RANDOM_DOMAINS.DOMAIN_ID
|
||||
WHERE STATE<2
|
||||
AND SERVER_AVAILABLE
|
||||
AND DOMAIN_SET=?
|
||||
AND DOMAIN_ALIAS IS NULL
|
||||
ORDER BY RAND()
|
||||
|
@@ -11,6 +11,7 @@ import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
import javax.annotation.CheckReturnValue;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@@ -59,6 +60,11 @@ public class FeedsClient {
|
||||
.forEachRemaining(rsp -> consumer.accept(rsp.getDomain(), new ArrayList<>(rsp.getUrlList())));
|
||||
}
|
||||
|
||||
public boolean waitReady(Duration duration) throws InterruptedException {
|
||||
return channelPool.awaitChannel(duration);
|
||||
}
|
||||
|
||||
|
||||
/** Get the hash of the feed data, for identifying when the data has been updated */
|
||||
public String getFeedDataHash() {
|
||||
return channelPool.call(FeedApiGrpc.FeedApiBlockingStub::getFeedDataHash)
|
||||
|
@@ -22,6 +22,7 @@ dependencies {
|
||||
implementation project(':code:common:db')
|
||||
implementation project(':code:libraries:blocking-thread-pool')
|
||||
implementation project(':code:libraries:message-queue')
|
||||
implementation project(':code:libraries:domain-lock')
|
||||
|
||||
implementation project(':code:execution:api')
|
||||
implementation project(':code:processes:crawling-process:ft-content-type')
|
||||
@@ -34,6 +35,7 @@ dependencies {
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.commons.lang3
|
||||
implementation libs.commons.io
|
||||
implementation libs.httpclient
|
||||
implementation libs.wiremock
|
||||
|
||||
implementation libs.prometheus
|
||||
|
@@ -1,66 +0,0 @@
|
||||
package nu.marginalia.rss.svc;
|
||||
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.Semaphore;
|
||||
|
||||
/** Holds lock objects for each domain, to prevent multiple threads from
|
||||
* crawling the same domain at the same time.
|
||||
*/
|
||||
public class DomainLocks {
|
||||
// The locks are stored in a map, with the domain name as the key. This map will grow
|
||||
// relatively big, but should be manageable since the number of domains is limited to
|
||||
// a few hundred thousand typically.
|
||||
private final Map<String, Semaphore> locks = new ConcurrentHashMap<>();
|
||||
|
||||
/** Returns a lock object corresponding to the given domain. The object is returned as-is,
|
||||
* and may be held by another thread. The caller is responsible for locking and releasing the lock.
|
||||
*/
|
||||
public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
|
||||
return new DomainLock(domain.toString(),
|
||||
locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits));
|
||||
}
|
||||
|
||||
private Semaphore defaultPermits(String topDomain) {
|
||||
if (topDomain.equals("wordpress.com"))
|
||||
return new Semaphore(16);
|
||||
if (topDomain.equals("blogspot.com"))
|
||||
return new Semaphore(8);
|
||||
|
||||
if (topDomain.equals("neocities.org"))
|
||||
return new Semaphore(4);
|
||||
if (topDomain.equals("github.io"))
|
||||
return new Semaphore(4);
|
||||
|
||||
if (topDomain.equals("substack.com")) {
|
||||
return new Semaphore(1);
|
||||
}
|
||||
if (topDomain.endsWith(".edu")) {
|
||||
return new Semaphore(1);
|
||||
}
|
||||
|
||||
return new Semaphore(2);
|
||||
}
|
||||
|
||||
public static class DomainLock implements AutoCloseable {
|
||||
private final String domainName;
|
||||
private final Semaphore semaphore;
|
||||
|
||||
DomainLock(String domainName, Semaphore semaphore) throws InterruptedException {
|
||||
this.domainName = domainName;
|
||||
this.semaphore = semaphore;
|
||||
|
||||
Thread.currentThread().setName("fetching:" + domainName + " [await domain lock]");
|
||||
semaphore.acquire();
|
||||
Thread.currentThread().setName("fetching:" + domainName);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
semaphore.release();
|
||||
Thread.currentThread().setName("fetching:" + domainName + " [wrapping up]");
|
||||
}
|
||||
}
|
||||
}
|
@@ -5,6 +5,8 @@ import com.opencsv.CSVReader;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.contenttype.ContentType;
|
||||
import nu.marginalia.contenttype.DocumentBodyToString;
|
||||
import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.coordination.DomainLock;
|
||||
import nu.marginalia.executor.client.ExecutorClient;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.nodecfg.NodeConfigurationService;
|
||||
@@ -18,19 +20,36 @@ import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorage;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.config.ConnectionConfig;
|
||||
import org.apache.hc.client5.http.config.RequestConfig;
|
||||
import org.apache.hc.client5.http.cookie.StandardCookieSpec;
|
||||
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder;
|
||||
import org.apache.hc.core5.http.Header;
|
||||
import org.apache.hc.core5.http.HeaderElement;
|
||||
import org.apache.hc.core5.http.HeaderElements;
|
||||
import org.apache.hc.core5.http.HttpResponse;
|
||||
import org.apache.hc.core5.http.io.SocketConfig;
|
||||
import org.apache.hc.core5.http.io.entity.EntityUtils;
|
||||
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
||||
import org.apache.hc.core5.http.message.MessageSupport;
|
||||
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||
import org.apache.hc.core5.util.TimeValue;
|
||||
import org.apache.hc.core5.util.Timeout;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.sql.SQLException;
|
||||
import java.time.*;
|
||||
import java.time.Instant;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.ZoneId;
|
||||
import java.time.ZonedDateTime;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
@@ -51,12 +70,15 @@ public class FeedFetcherService {
|
||||
private final ServiceHeartbeat serviceHeartbeat;
|
||||
private final ExecutorClient executorClient;
|
||||
|
||||
private final DomainLocks domainLocks = new DomainLocks();
|
||||
private final DomainCoordinator domainCoordinator;
|
||||
|
||||
private final HttpClient httpClient;
|
||||
|
||||
private volatile boolean updating;
|
||||
|
||||
@Inject
|
||||
public FeedFetcherService(FeedDb feedDb,
|
||||
DomainCoordinator domainCoordinator,
|
||||
FileStorageService fileStorageService,
|
||||
NodeConfigurationService nodeConfigurationService,
|
||||
ServiceHeartbeat serviceHeartbeat,
|
||||
@@ -67,6 +89,84 @@ public class FeedFetcherService {
|
||||
this.nodeConfigurationService = nodeConfigurationService;
|
||||
this.serviceHeartbeat = serviceHeartbeat;
|
||||
this.executorClient = executorClient;
|
||||
this.domainCoordinator = domainCoordinator;
|
||||
|
||||
final ConnectionConfig connectionConfig = ConnectionConfig.custom()
|
||||
.setSocketTimeout(15, TimeUnit.SECONDS)
|
||||
.setConnectTimeout(15, TimeUnit.SECONDS)
|
||||
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
|
||||
.build();
|
||||
|
||||
|
||||
var connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
|
||||
.setMaxConnPerRoute(2)
|
||||
.setMaxConnTotal(50)
|
||||
.setDefaultConnectionConfig(connectionConfig)
|
||||
.build();
|
||||
|
||||
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
|
||||
.setSoLinger(TimeValue.ofSeconds(-1))
|
||||
.setSoTimeout(Timeout.ofSeconds(10))
|
||||
.build()
|
||||
);
|
||||
|
||||
Thread.ofPlatform().daemon(true).start(() -> {
|
||||
try {
|
||||
for (;;) {
|
||||
TimeUnit.SECONDS.sleep(15);
|
||||
logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
|
||||
}
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
});
|
||||
|
||||
final RequestConfig defaultRequestConfig = RequestConfig.custom()
|
||||
.setCookieSpec(StandardCookieSpec.IGNORE)
|
||||
.setResponseTimeout(10, TimeUnit.SECONDS)
|
||||
.setConnectionRequestTimeout(5, TimeUnit.MINUTES)
|
||||
.build();
|
||||
|
||||
httpClient = HttpClients.custom()
|
||||
.setDefaultRequestConfig(defaultRequestConfig)
|
||||
.setConnectionManager(connectionManager)
|
||||
.setUserAgent(WmsaHome.getUserAgent().uaIdentifier())
|
||||
.setConnectionManager(connectionManager)
|
||||
.setKeepAliveStrategy(new ConnectionKeepAliveStrategy() {
|
||||
// Default keep-alive duration is 3 minutes, but this is too long for us,
|
||||
// as we are either going to re-use it fairly quickly or close it for a long time.
|
||||
//
|
||||
// So we set it to 30 seconds or clamp the server-provided value to a minimum of 10 seconds.
|
||||
private static final TimeValue defaultValue = TimeValue.ofSeconds(30);
|
||||
|
||||
@Override
|
||||
public TimeValue getKeepAliveDuration(HttpResponse response, HttpContext context) {
|
||||
final Iterator<HeaderElement> it = MessageSupport.iterate(response, HeaderElements.KEEP_ALIVE);
|
||||
|
||||
while (it.hasNext()) {
|
||||
final HeaderElement he = it.next();
|
||||
final String param = he.getName();
|
||||
final String value = he.getValue();
|
||||
|
||||
if (value == null)
|
||||
continue;
|
||||
if (!"timeout".equalsIgnoreCase(param))
|
||||
continue;
|
||||
|
||||
try {
|
||||
long timeout = Long.parseLong(value);
|
||||
timeout = Math.clamp(timeout, 30, defaultValue.toSeconds());
|
||||
return TimeValue.ofSeconds(timeout);
|
||||
} catch (final NumberFormatException ignore) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return defaultValue;
|
||||
}
|
||||
})
|
||||
.build();
|
||||
|
||||
}
|
||||
|
||||
public enum UpdateMode {
|
||||
@@ -82,13 +182,7 @@ public class FeedFetcherService {
|
||||
|
||||
|
||||
try (FeedDbWriter writer = feedDb.createWriter();
|
||||
HttpClient client = HttpClient.newBuilder()
|
||||
.connectTimeout(Duration.ofSeconds(15))
|
||||
.executor(Executors.newCachedThreadPool())
|
||||
.followRedirects(HttpClient.Redirect.NORMAL)
|
||||
.version(HttpClient.Version.HTTP_2)
|
||||
.build();
|
||||
ExecutorService fetchExecutor = Executors.newCachedThreadPool();
|
||||
ExecutorService fetchExecutor = Executors.newVirtualThreadPerTaskExecutor();
|
||||
FeedJournal feedJournal = FeedJournal.create();
|
||||
var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Update Rss Feeds")
|
||||
) {
|
||||
@@ -132,8 +226,9 @@ public class FeedFetcherService {
|
||||
};
|
||||
|
||||
FetchResult feedData;
|
||||
try (DomainLocks.DomainLock domainLock = domainLocks.lockDomain(new EdgeDomain(feed.domain()))) {
|
||||
feedData = fetchFeedData(feed, client, fetchExecutor, ifModifiedSinceDate, ifNoneMatchTag);
|
||||
try (DomainLock domainLock = domainCoordinator.lockDomain(new EdgeDomain(feed.domain()))) {
|
||||
feedData = fetchFeedData(feed, fetchExecutor, ifModifiedSinceDate, ifNoneMatchTag);
|
||||
TimeUnit.SECONDS.sleep(1); // Sleep before we yield the lock to avoid hammering the server from multiple processes
|
||||
} catch (Exception ex) {
|
||||
feedData = new FetchResult.TransientError();
|
||||
}
|
||||
@@ -212,7 +307,6 @@ public class FeedFetcherService {
|
||||
}
|
||||
|
||||
private FetchResult fetchFeedData(FeedDefinition feed,
|
||||
HttpClient client,
|
||||
ExecutorService executorService,
|
||||
@Nullable String ifModifiedSinceDate,
|
||||
@Nullable String ifNoneMatchTag)
|
||||
@@ -220,59 +314,63 @@ public class FeedFetcherService {
|
||||
try {
|
||||
URI uri = new URI(feed.feedUrl());
|
||||
|
||||
HttpRequest.Builder requestBuilder = HttpRequest.newBuilder()
|
||||
.GET()
|
||||
.uri(uri)
|
||||
.header("User-Agent", WmsaHome.getUserAgent().uaIdentifier())
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.header("Accept", "text/*, */*;q=0.9")
|
||||
.timeout(Duration.ofSeconds(15))
|
||||
;
|
||||
var requestBuilder = ClassicRequestBuilder.get(uri)
|
||||
.setHeader("User-Agent", WmsaHome.getUserAgent().uaIdentifier())
|
||||
.setHeader("Accept-Encoding", "gzip")
|
||||
.setHeader("Accept", "text/*, */*;q=0.9");
|
||||
|
||||
// Set the If-Modified-Since or If-None-Match headers if we have them
|
||||
// though since there are certain idiosyncrasies in server implementations,
|
||||
// we avoid setting both at the same time as that may turn a 304 into a 200.
|
||||
if (ifNoneMatchTag != null) {
|
||||
requestBuilder.header("If-None-Match", ifNoneMatchTag);
|
||||
requestBuilder.addHeader("If-None-Match", ifNoneMatchTag);
|
||||
} else if (ifModifiedSinceDate != null) {
|
||||
requestBuilder.header("If-Modified-Since", ifModifiedSinceDate);
|
||||
requestBuilder.addHeader("If-Modified-Since", ifModifiedSinceDate);
|
||||
}
|
||||
|
||||
return httpClient.execute(requestBuilder.build(), rsp -> {
|
||||
try {
|
||||
logger.info("Code: {}, URL: {}", rsp.getCode(), uri);
|
||||
|
||||
HttpRequest getRequest = requestBuilder.build();
|
||||
switch (rsp.getCode()) {
|
||||
case 200 -> {
|
||||
if (rsp.getEntity() == null) {
|
||||
return new FetchResult.TransientError(); // No content to read, treat as transient error
|
||||
}
|
||||
byte[] responseData = EntityUtils.toByteArray(rsp.getEntity());
|
||||
|
||||
for (int i = 0; i < 3; i++) {
|
||||
// Decode the response body based on the Content-Type header
|
||||
Header contentTypeHeader = rsp.getFirstHeader("Content-Type");
|
||||
if (contentTypeHeader == null) {
|
||||
return new FetchResult.TransientError();
|
||||
}
|
||||
String contentType = contentTypeHeader.getValue();
|
||||
String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), responseData);
|
||||
|
||||
/* Note we need to use an executor to time-limit the send() method in HttpClient, as
|
||||
* its support for timeouts only applies to the time until response starts to be received,
|
||||
* and does not catch the case when the server starts to send data but then hangs.
|
||||
*/
|
||||
HttpResponse<byte[]> rs = executorService.submit(
|
||||
() -> client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray()))
|
||||
.get(15, TimeUnit.SECONDS);
|
||||
// Grab the ETag header if it exists
|
||||
Header etagHeader = rsp.getFirstHeader("ETag");
|
||||
String newEtagValue = etagHeader == null ? null : etagHeader.getValue();
|
||||
|
||||
if (rs.statusCode() == 429) { // Too Many Requests
|
||||
int retryAfter = Integer.parseInt(rs.headers().firstValue("Retry-After").orElse("2"));
|
||||
Thread.sleep(Duration.ofSeconds(Math.clamp(retryAfter, 1, 5)));
|
||||
continue;
|
||||
}
|
||||
|
||||
String newEtagValue = rs.headers().firstValue("ETag").orElse("");
|
||||
|
||||
return switch (rs.statusCode()) {
|
||||
case 200 -> {
|
||||
byte[] responseData = getResponseData(rs);
|
||||
|
||||
String contentType = rs.headers().firstValue("Content-Type").orElse("");
|
||||
String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), responseData);
|
||||
|
||||
yield new FetchResult.Success(bodyText, newEtagValue);
|
||||
return new FetchResult.Success(bodyText, newEtagValue);
|
||||
}
|
||||
case 304 -> {
|
||||
return new FetchResult.NotModified(); // via If-Modified-Since semantics
|
||||
}
|
||||
case 404 -> {
|
||||
return new FetchResult.PermanentError(); // never try again
|
||||
}
|
||||
default -> {
|
||||
return new FetchResult.TransientError(); // we try again later
|
||||
}
|
||||
}
|
||||
case 304 -> new FetchResult.NotModified(); // via If-Modified-Since semantics
|
||||
case 404 -> new FetchResult.PermanentError(); // never try again
|
||||
default -> new FetchResult.TransientError(); // we try again later
|
||||
};
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
return new FetchResult.PermanentError(); // treat as permanent error
|
||||
}
|
||||
finally {
|
||||
EntityUtils.consumeQuietly(rsp.getEntity());
|
||||
}
|
||||
});
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.debug("Error fetching feed", ex);
|
||||
@@ -281,19 +379,6 @@ public class FeedFetcherService {
|
||||
return new FetchResult.TransientError();
|
||||
}
|
||||
|
||||
private byte[] getResponseData(HttpResponse<byte[]> response) throws IOException {
|
||||
String encoding = response.headers().firstValue("Content-Encoding").orElse("");
|
||||
|
||||
if ("gzip".equals(encoding)) {
|
||||
try (var stream = new GZIPInputStream(new ByteArrayInputStream(response.body()))) {
|
||||
return stream.readAllBytes();
|
||||
}
|
||||
}
|
||||
else {
|
||||
return response.body();
|
||||
}
|
||||
}
|
||||
|
||||
public sealed interface FetchResult {
|
||||
record Success(String value, String etag) implements FetchResult {}
|
||||
record NotModified() implements FetchResult {}
|
||||
|
@@ -5,6 +5,8 @@ import com.google.inject.Guice;
|
||||
import com.google.inject.name.Names;
|
||||
import com.zaxxer.hikari.HikariConfig;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.coordination.LocalDomainCoordinator;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.rss.db.FeedDb;
|
||||
import nu.marginalia.rss.model.FeedItems;
|
||||
@@ -82,6 +84,7 @@ class FeedFetcherServiceTest extends AbstractModule {
|
||||
}
|
||||
|
||||
public void configure() {
|
||||
bind(DomainCoordinator.class).to(LocalDomainCoordinator.class);
|
||||
bind(HikariDataSource.class).toInstance(dataSource);
|
||||
bind(ServiceRegistryIf.class).toInstance(Mockito.mock(ServiceRegistryIf.class));
|
||||
bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(ServiceId.Executor, 1, "", "", 0, UUID.randomUUID()));
|
||||
|
32
code/libraries/domain-lock/build.gradle
Normal file
32
code/libraries/domain-lock/build.gradle
Normal file
@@ -0,0 +1,32 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation libs.bundles.slf4j
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
||||
|
||||
implementation libs.bundles.curator
|
||||
|
||||
implementation libs.guava
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
}
|
||||
|
||||
test {
|
||||
useJUnitPlatform()
|
||||
}
|
@@ -0,0 +1,32 @@
|
||||
package nu.marginalia.coordination;
|
||||
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
|
||||
public class DefaultDomainPermits {
|
||||
|
||||
public static int defaultPermits(EdgeDomain domain) {
|
||||
return defaultPermits(domain.topDomain.toLowerCase());
|
||||
}
|
||||
|
||||
public static int defaultPermits(String topDomain) {
|
||||
|
||||
if (topDomain.equals("wordpress.com"))
|
||||
return 16;
|
||||
if (topDomain.equals("blogspot.com"))
|
||||
return 8;
|
||||
if (topDomain.equals("tumblr.com"))
|
||||
return 8;
|
||||
if (topDomain.equals("neocities.org"))
|
||||
return 8;
|
||||
if (topDomain.equals("github.io"))
|
||||
return 8;
|
||||
// Substack really dislikes broad-scale crawlers, so we need to be careful
|
||||
// to not get blocked.
|
||||
if (topDomain.equals("substack.com")) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 2;
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,17 @@
|
||||
package nu.marginalia.coordination;
|
||||
|
||||
import com.google.inject.AbstractModule;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
public class DomainCoordinationModule extends AbstractModule {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(DomainCoordinationModule.class);
|
||||
|
||||
public DomainCoordinationModule() {
|
||||
}
|
||||
|
||||
public void configure() {
|
||||
bind(DomainCoordinator.class).to(ZookeeperDomainCoordinator.class);
|
||||
}
|
||||
}
|
@@ -0,0 +1,13 @@
|
||||
package nu.marginalia.coordination;
|
||||
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.Optional;
|
||||
|
||||
public interface DomainCoordinator {
|
||||
DomainLock lockDomain(EdgeDomain domain) throws InterruptedException;
|
||||
Optional<DomainLock> tryLockDomain(EdgeDomain domain, Duration timeout) throws InterruptedException;
|
||||
Optional<DomainLock> tryLockDomain(EdgeDomain domain) throws InterruptedException;
|
||||
boolean isLockableHint(EdgeDomain domain);
|
||||
}
|
@@ -0,0 +1,5 @@
|
||||
package nu.marginalia.coordination;
|
||||
|
||||
public interface DomainLock extends AutoCloseable {
|
||||
void close();
|
||||
}
|
@@ -1,16 +1,17 @@
|
||||
package nu.marginalia.crawl.logic;
|
||||
package nu.marginalia.coordination;
|
||||
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.Semaphore;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/** Holds lock objects for each domain, to prevent multiple threads from
|
||||
* crawling the same domain at the same time.
|
||||
*/
|
||||
public class DomainLocks {
|
||||
@Singleton
|
||||
public class LocalDomainCoordinator implements DomainCoordinator {
|
||||
// The locks are stored in a map, with the domain name as the key. This map will grow
|
||||
// relatively big, but should be manageable since the number of domains is limited to
|
||||
// a few hundred thousand typically.
|
||||
@@ -24,13 +25,25 @@ public class DomainLocks {
|
||||
|
||||
sem.acquire();
|
||||
|
||||
return new DomainLock(sem);
|
||||
return new LocalDomainLock(sem);
|
||||
}
|
||||
|
||||
public Optional<DomainLock> tryLockDomain(EdgeDomain domain) {
|
||||
var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
|
||||
if (sem.tryAcquire(1)) {
|
||||
return Optional.of(new DomainLock(sem));
|
||||
return Optional.of(new LocalDomainLock(sem));
|
||||
}
|
||||
else {
|
||||
// We don't have a lock, so we return an empty optional
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public Optional<DomainLock> tryLockDomain(EdgeDomain domain, Duration timeout) throws InterruptedException {
|
||||
var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
|
||||
if (sem.tryAcquire(1, timeout.toMillis(), TimeUnit.MILLISECONDS)) {
|
||||
return Optional.of(new LocalDomainLock(sem));
|
||||
}
|
||||
else {
|
||||
// We don't have a lock, so we return an empty optional
|
||||
@@ -39,24 +52,7 @@ public class DomainLocks {
|
||||
}
|
||||
|
||||
private Semaphore defaultPermits(String topDomain) {
|
||||
if (topDomain.equals("wordpress.com"))
|
||||
return new Semaphore(16);
|
||||
if (topDomain.equals("blogspot.com"))
|
||||
return new Semaphore(8);
|
||||
if (topDomain.equals("tumblr.com"))
|
||||
return new Semaphore(8);
|
||||
if (topDomain.equals("neocities.org"))
|
||||
return new Semaphore(8);
|
||||
if (topDomain.equals("github.io"))
|
||||
return new Semaphore(8);
|
||||
|
||||
// Substack really dislikes broad-scale crawlers, so we need to be careful
|
||||
// to not get blocked.
|
||||
if (topDomain.equals("substack.com")) {
|
||||
return new Semaphore(1);
|
||||
}
|
||||
|
||||
return new Semaphore(2);
|
||||
return new Semaphore(DefaultDomainPermits.defaultPermits(topDomain));
|
||||
}
|
||||
|
||||
/** Returns true if the domain is lockable, i.e. if it is not already locked by another thread.
|
||||
@@ -71,15 +67,15 @@ public class DomainLocks {
|
||||
return sem.availablePermits() > 0;
|
||||
}
|
||||
|
||||
public static class DomainLock implements AutoCloseable {
|
||||
public static class LocalDomainLock implements DomainLock {
|
||||
private final Semaphore semaphore;
|
||||
|
||||
DomainLock(Semaphore semaphore) {
|
||||
LocalDomainLock(Semaphore semaphore) {
|
||||
this.semaphore = semaphore;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws Exception {
|
||||
public void close() {
|
||||
semaphore.release();
|
||||
Thread.currentThread().setName("[idle]");
|
||||
}
|
@@ -0,0 +1,116 @@
|
||||
package nu.marginalia.coordination;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import org.apache.curator.framework.recipes.locks.InterProcessSemaphoreV2;
|
||||
import org.apache.curator.framework.recipes.locks.Lease;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
@Singleton
|
||||
public class ZookeeperDomainCoordinator implements DomainCoordinator {
|
||||
// The locks are stored in a map, with the domain name as the key. This map will grow
|
||||
// relatively big, but should be manageable since the number of domains is limited to
|
||||
// a few hundred thousand typically.
|
||||
private final Map<String, InterProcessSemaphoreV2> locks = new ConcurrentHashMap<>();
|
||||
private final Map<String, Integer> waitCounts = new ConcurrentHashMap<>();
|
||||
|
||||
private final ServiceRegistryIf serviceRegistry;
|
||||
private final int nodeId;
|
||||
|
||||
@Inject
|
||||
public ZookeeperDomainCoordinator(ServiceRegistryIf serviceRegistry, @Named("wmsa-system-node") int nodeId) {
|
||||
// Zookeeper-specific initialization can be done here if needed
|
||||
this.serviceRegistry = serviceRegistry;
|
||||
this.nodeId = nodeId;
|
||||
}
|
||||
|
||||
/** Returns a lock object corresponding to the given domain. The object is returned as-is,
|
||||
* and may be held by another thread. The caller is responsible for locking and releasing the lock.
|
||||
*/
|
||||
public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
|
||||
final String key = domain.topDomain.toLowerCase();
|
||||
var sem = locks.computeIfAbsent(key, this::createSemapore);
|
||||
|
||||
// Increment or add a wait count for the domain
|
||||
waitCounts.compute(key, (k,value) -> (value == null ? 1 : value + 1));
|
||||
try {
|
||||
return new ZkDomainLock(sem, sem.acquire());
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException("Failed to acquire lock for domain: " + domain.topDomain, e);
|
||||
}
|
||||
finally {
|
||||
// Decrement or remove the wait count for the domain
|
||||
waitCounts.compute(key, (k,value) -> (value == null || value <= 1) ? null : value - 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public Optional<DomainLock> tryLockDomain(EdgeDomain domain) throws InterruptedException {
|
||||
return tryLockDomain(domain, Duration.ofSeconds(1)); // Underlying semaphore doesn't have a tryLock method, so we use a short timeout
|
||||
}
|
||||
|
||||
|
||||
public Optional<DomainLock> tryLockDomain(EdgeDomain domain, Duration timeout) throws InterruptedException {
|
||||
final String key = domain.topDomain.toLowerCase();
|
||||
var sem = locks.computeIfAbsent(key, this::createSemapore);
|
||||
|
||||
// Increment or add a wait count for the domain
|
||||
waitCounts.compute(key, (k,value) -> (value == null ? 1 : value + 1));
|
||||
try {
|
||||
var lease = sem.acquire(timeout.toMillis(), TimeUnit.MILLISECONDS); // Acquire with timeout
|
||||
if (lease != null) {
|
||||
return Optional.of(new ZkDomainLock(sem, lease));
|
||||
}
|
||||
else {
|
||||
return Optional.empty(); // If we fail to acquire the lease, we return an empty optional
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
return Optional.empty(); // If we fail to acquire the lock, we return an empty optional
|
||||
}
|
||||
finally {
|
||||
waitCounts.compute(key, (k,value) -> (value == null || value <= 1) ? null : value - 1);
|
||||
}
|
||||
}
|
||||
|
||||
private InterProcessSemaphoreV2 createSemapore(String topDomain){
|
||||
try {
|
||||
return serviceRegistry.getSemaphore(topDomain + ":" + nodeId, DefaultDomainPermits.defaultPermits(topDomain));
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException("Failed to get semaphore for domain: " + topDomain, e);
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns true if the domain is lockable, i.e. if it is not already locked by another thread.
|
||||
* (this is just a hint, and does not guarantee that the domain is actually lockable any time
|
||||
* after this method returns true)
|
||||
*/
|
||||
public boolean isLockableHint(EdgeDomain domain) {
|
||||
return !waitCounts.containsKey(domain.topDomain.toLowerCase());
|
||||
}
|
||||
|
||||
public static class ZkDomainLock implements DomainLock {
|
||||
private final InterProcessSemaphoreV2 semaphore;
|
||||
private final Lease lease;
|
||||
|
||||
ZkDomainLock(InterProcessSemaphoreV2 semaphore, Lease lease) {
|
||||
this.semaphore = semaphore;
|
||||
this.lease = lease;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
semaphore.returnLease(lease);
|
||||
}
|
||||
}
|
||||
}
|
@@ -15,6 +15,10 @@ dependencies {
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.opencsv
|
||||
implementation libs.guava
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
|
@@ -1,5 +1,6 @@
|
||||
package nu.marginalia.geoip;
|
||||
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.geoip.sources.AsnMapping;
|
||||
import nu.marginalia.geoip.sources.AsnTable;
|
||||
@@ -10,6 +11,7 @@ import org.slf4j.LoggerFactory;
|
||||
import java.net.InetAddress;
|
||||
import java.util.Optional;
|
||||
|
||||
@Singleton
|
||||
public class GeoIpDictionary {
|
||||
private volatile IP2LocationMapping ip2locMapping = null;
|
||||
private volatile AsnTable asnTable = null;
|
||||
@@ -76,7 +78,7 @@ public class GeoIpDictionary {
|
||||
}
|
||||
|
||||
public Optional<AsnTable.AsnInfo> getAsnInfo(int ipAddress) {
|
||||
if (null == asnTable) { // not loaded yet or failed to load
|
||||
if (null == asnMapping || null == asnTable) { // not loaded yet or failed to load
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
|
@@ -32,6 +32,7 @@ dependencies {
|
||||
implementation project(':code:libraries:message-queue')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
implementation project(':code:libraries:easy-lsh')
|
||||
implementation project(':code:libraries:domain-lock')
|
||||
implementation project(':code:processes:crawling-process:model')
|
||||
implementation project(':code:processes:crawling-process:model')
|
||||
|
||||
@@ -58,6 +59,7 @@ dependencies {
|
||||
implementation libs.jsoup
|
||||
implementation libs.opencsv
|
||||
implementation libs.fastutil
|
||||
implementation libs.bundles.curator
|
||||
|
||||
implementation libs.bundles.mariadb
|
||||
implementation libs.bundles.httpcomponents
|
||||
|
@@ -10,9 +10,11 @@ import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.atags.source.AnchorTagsSource;
|
||||
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
|
||||
import nu.marginalia.coordination.DomainCoordinationModule;
|
||||
import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.coordination.DomainLock;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.logic.DomainLocks;
|
||||
import nu.marginalia.crawl.retreival.CrawlDataReference;
|
||||
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
|
||||
import nu.marginalia.crawl.retreival.DomainProber;
|
||||
@@ -25,9 +27,12 @@ import nu.marginalia.mq.MessageQueueFactory;
|
||||
import nu.marginalia.process.ProcessConfiguration;
|
||||
import nu.marginalia.process.ProcessConfigurationModule;
|
||||
import nu.marginalia.process.ProcessMainClass;
|
||||
import nu.marginalia.process.control.ProcessEventLog;
|
||||
import nu.marginalia.process.control.ProcessHeartbeatImpl;
|
||||
import nu.marginalia.process.log.WorkLog;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.module.DatabaseModule;
|
||||
import nu.marginalia.service.module.ServiceDiscoveryModule;
|
||||
import nu.marginalia.slop.SlopCrawlDataRecord;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
@@ -54,6 +59,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
|
||||
private final UserAgent userAgent;
|
||||
private final ProcessHeartbeatImpl heartbeat;
|
||||
private final ProcessEventLog eventLog;
|
||||
private final DomainProber domainProber;
|
||||
private final FileStorageService fileStorageService;
|
||||
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
|
||||
@@ -61,9 +67,10 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
private final HikariDataSource dataSource;
|
||||
private final DomainBlacklist blacklist;
|
||||
private final int node;
|
||||
private final ServiceRegistryIf serviceRegistry;
|
||||
private final SimpleBlockingThreadPool pool;
|
||||
|
||||
private final DomainLocks domainLocks = new DomainLocks();
|
||||
private final DomainCoordinator domainCoordinator;
|
||||
|
||||
private final Map<String, CrawlTask> pendingCrawlTasks = new ConcurrentHashMap<>();
|
||||
|
||||
@@ -84,6 +91,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
public CrawlerMain(UserAgent userAgent,
|
||||
HttpFetcherImpl httpFetcher,
|
||||
ProcessHeartbeatImpl heartbeat,
|
||||
ProcessEventLog eventLog,
|
||||
MessageQueueFactory messageQueueFactory, DomainProber domainProber,
|
||||
FileStorageService fileStorageService,
|
||||
ProcessConfiguration processConfiguration,
|
||||
@@ -91,6 +99,8 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
WarcArchiverFactory warcArchiverFactory,
|
||||
HikariDataSource dataSource,
|
||||
DomainBlacklist blacklist,
|
||||
DomainCoordinator domainCoordinator,
|
||||
ServiceRegistryIf serviceRegistry,
|
||||
Gson gson) throws InterruptedException {
|
||||
|
||||
super(messageQueueFactory, processConfiguration, gson, CRAWLER_INBOX);
|
||||
@@ -98,6 +108,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
this.userAgent = userAgent;
|
||||
this.fetcher = httpFetcher;
|
||||
this.heartbeat = heartbeat;
|
||||
this.eventLog = eventLog;
|
||||
this.domainProber = domainProber;
|
||||
this.fileStorageService = fileStorageService;
|
||||
this.anchorTagsSourceFactory = anchorTagsSourceFactory;
|
||||
@@ -105,6 +116,8 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
this.dataSource = dataSource;
|
||||
this.blacklist = blacklist;
|
||||
this.node = processConfiguration.node();
|
||||
this.serviceRegistry = serviceRegistry;
|
||||
this.domainCoordinator = domainCoordinator;
|
||||
|
||||
SimpleBlockingThreadPool.ThreadType threadType;
|
||||
if (Boolean.getBoolean("crawler.useVirtualThreads")) {
|
||||
@@ -147,12 +160,18 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
Injector injector = Guice.createInjector(
|
||||
new CrawlerModule(),
|
||||
new ProcessConfigurationModule("crawler"),
|
||||
new ServiceDiscoveryModule(),
|
||||
new DomainCoordinationModule(),
|
||||
new DatabaseModule(false)
|
||||
);
|
||||
var crawler = injector.getInstance(CrawlerMain.class);
|
||||
|
||||
var instructions = crawler.fetchInstructions(nu.marginalia.mqapi.crawling.CrawlRequest.class);
|
||||
|
||||
crawler.serviceRegistry.registerProcess("crawler", crawler.node);
|
||||
|
||||
try {
|
||||
crawler.eventLog.logEvent("CRAWLER-INFO", "Crawling started");
|
||||
var req = instructions.value();
|
||||
if (req.targetDomainName != null) {
|
||||
crawler.runForSingleDomain(req.targetDomainName, req.crawlStorage);
|
||||
@@ -160,11 +179,15 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
else {
|
||||
crawler.runForDatabaseDomains(req.crawlStorage);
|
||||
}
|
||||
crawler.eventLog.logEvent("CRAWLER-INFO", "Crawl completed successfully");
|
||||
instructions.ok();
|
||||
} catch (Exception ex) {
|
||||
logger.error("Crawler failed", ex);
|
||||
instructions.err();
|
||||
}
|
||||
finally {
|
||||
crawler.serviceRegistry.deregisterProcess("crawler", crawler.node);
|
||||
}
|
||||
|
||||
TimeUnit.SECONDS.sleep(5);
|
||||
}
|
||||
@@ -433,7 +456,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
/** Best effort indicator whether we could start this now without getting stuck in
|
||||
* DomainLocks purgatory */
|
||||
public boolean canRun() {
|
||||
return domainLocks.isLockableHint(new EdgeDomain(domain));
|
||||
return domainCoordinator.isLockableHint(new EdgeDomain(domain));
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -444,7 +467,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
return;
|
||||
}
|
||||
|
||||
Optional<DomainLocks.DomainLock> lock = domainLocks.tryLockDomain(new EdgeDomain(domain));
|
||||
Optional<DomainLock> lock = domainCoordinator.tryLockDomain(new EdgeDomain(domain));
|
||||
// We don't have a lock, so we can't run this task
|
||||
// we return to avoid blocking the pool for too long
|
||||
if (lock.isEmpty()) {
|
||||
@@ -452,7 +475,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
retryQueue.put(this);
|
||||
return;
|
||||
}
|
||||
DomainLocks.DomainLock domainLock = lock.get();
|
||||
DomainLock domainLock = lock.get();
|
||||
|
||||
try (domainLock) {
|
||||
Thread.currentThread().setName("crawling:" + domain);
|
||||
|
@@ -52,6 +52,7 @@ import java.io.IOException;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.UnknownHostException;
|
||||
import java.security.KeyManagementException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
@@ -87,13 +88,14 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
||||
return connectionManager.getTotalStats();
|
||||
}
|
||||
|
||||
private CloseableHttpClient createClient() throws NoSuchAlgorithmException {
|
||||
private CloseableHttpClient createClient() throws NoSuchAlgorithmException, KeyManagementException {
|
||||
final ConnectionConfig connectionConfig = ConnectionConfig.custom()
|
||||
.setSocketTimeout(10, TimeUnit.SECONDS)
|
||||
.setConnectTimeout(30, TimeUnit.SECONDS)
|
||||
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
|
||||
.build();
|
||||
|
||||
|
||||
connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
|
||||
.setMaxConnPerRoute(2)
|
||||
.setMaxConnTotal(5000)
|
||||
@@ -183,6 +185,8 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
||||
this.client = createClient();
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
throw new RuntimeException(e);
|
||||
} catch (KeyManagementException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
this.userAgentString = userAgent.uaString();
|
||||
this.userAgentIdentifier = userAgent.uaIdentifier();
|
||||
@@ -193,6 +197,8 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
||||
this.client = createClient();
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
throw new RuntimeException(e);
|
||||
} catch (KeyManagementException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
this.userAgentString = userAgent;
|
||||
this.userAgentIdentifier = userAgent;
|
||||
|
@@ -115,9 +115,13 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(probedUrl.domain, warcRecorder);
|
||||
final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
|
||||
|
||||
if (!robotsRules.isAllowed(probedUrl.toString())) {
|
||||
warcRecorder.flagAsRobotsTxtError(probedUrl);
|
||||
yield 1; // Nothing we can do here, we aren't allowed to fetch the root URL
|
||||
}
|
||||
delayTimer.waitFetchDelay(0); // initial delay after robots.txt
|
||||
|
||||
DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, delayTimer);
|
||||
DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, robotsRules, delayTimer);
|
||||
domainStateDb.save(summaryRecord);
|
||||
|
||||
if (Thread.interrupted()) {
|
||||
@@ -270,11 +274,11 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
|
||||
|
||||
|
||||
private DomainStateDb.SummaryRecord sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
|
||||
private DomainStateDb.SummaryRecord sniffRootDocument(EdgeUrl rootUrl, SimpleRobotRules robotsRules, CrawlDelayTimer timer) {
|
||||
Optional<String> feedLink = Optional.empty();
|
||||
|
||||
try {
|
||||
var url = rootUrl.withPathAndParam("/", null);
|
||||
EdgeUrl url = rootUrl.withPathAndParam("/", null);
|
||||
|
||||
HttpFetchResult result = fetcher.fetchContent(url, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
|
||||
timer.waitFetchDelay(0);
|
||||
@@ -331,7 +335,7 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
|
||||
|
||||
if (feedLink.isEmpty()) {
|
||||
feedLink = guessFeedUrl(timer);
|
||||
feedLink = guessFeedUrl(timer, robotsRules);
|
||||
}
|
||||
|
||||
// Download the sitemap if available
|
||||
@@ -339,14 +343,18 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
|
||||
// Grab the favicon if it exists
|
||||
|
||||
if (fetcher.fetchContent(faviconUrl, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED) instanceof HttpFetchResult.ResultOk iconResult) {
|
||||
String contentType = iconResult.header("Content-Type");
|
||||
byte[] iconData = iconResult.getBodyBytes();
|
||||
if (robotsRules.isAllowed(faviconUrl.toString())) {
|
||||
if (fetcher.fetchContent(faviconUrl, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED)
|
||||
instanceof HttpFetchResult.ResultOk iconResult)
|
||||
{
|
||||
String contentType = iconResult.header("Content-Type");
|
||||
byte[] iconData = iconResult.getBodyBytes();
|
||||
|
||||
domainStateDb.saveIcon(
|
||||
domain,
|
||||
new DomainStateDb.FaviconRecord(contentType, iconData)
|
||||
);
|
||||
domainStateDb.saveIcon(
|
||||
domain,
|
||||
new DomainStateDb.FaviconRecord(contentType, iconData)
|
||||
);
|
||||
}
|
||||
}
|
||||
timer.waitFetchDelay(0);
|
||||
|
||||
@@ -383,7 +391,7 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
"blog/rss"
|
||||
);
|
||||
|
||||
private Optional<String> guessFeedUrl(CrawlDelayTimer timer) throws InterruptedException {
|
||||
private Optional<String> guessFeedUrl(CrawlDelayTimer timer, SimpleRobotRules robotsRules) throws InterruptedException {
|
||||
var oldDomainStateRecord = domainStateDb.getSummary(domain);
|
||||
|
||||
// If we are already aware of an old feed URL, then we can just revalidate it
|
||||
@@ -396,6 +404,9 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
|
||||
for (String endpoint : likelyFeedEndpoints) {
|
||||
String url = "https://" + domain + "/" + endpoint;
|
||||
if (!robotsRules.isAllowed(url)) {
|
||||
continue;
|
||||
}
|
||||
if (validateFeedUrl(url, timer)) {
|
||||
return Optional.of(url);
|
||||
}
|
||||
|
@@ -32,6 +32,7 @@ dependencies {
|
||||
implementation project(':code:index:api')
|
||||
implementation project(':code:processes:process-mq-api')
|
||||
implementation project(':code:libraries:message-queue')
|
||||
implementation project(':code:libraries:domain-lock')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
implementation project(':code:libraries:easy-lsh')
|
||||
implementation project(':code:processes:crawling-process')
|
||||
@@ -49,6 +50,7 @@ dependencies {
|
||||
|
||||
implementation libs.notnull
|
||||
implementation libs.guava
|
||||
implementation libs.httpclient
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
|
@@ -10,9 +10,12 @@ import nu.marginalia.api.feeds.FeedsClient;
|
||||
import nu.marginalia.converting.ConverterModule;
|
||||
import nu.marginalia.converting.processor.DomainProcessor;
|
||||
import nu.marginalia.converting.writer.ConverterBatchWriter;
|
||||
import nu.marginalia.coordination.DomainCoordinationModule;
|
||||
import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.db.DomainBlacklist;
|
||||
import nu.marginalia.io.SerializableCrawlDataStream;
|
||||
import nu.marginalia.livecrawler.io.HttpClientProvider;
|
||||
import nu.marginalia.loading.LoaderInputData;
|
||||
import nu.marginalia.loading.documents.DocumentLoaderService;
|
||||
import nu.marginalia.loading.documents.KeywordLoaderService;
|
||||
@@ -30,12 +33,15 @@ import nu.marginalia.service.module.ServiceDiscoveryModule;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageBaseType;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
|
||||
import org.apache.hc.core5.io.CloseMode;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.security.Security;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.time.temporal.ChronoUnit;
|
||||
import java.util.HashMap;
|
||||
@@ -58,6 +64,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
private final FileStorageService fileStorageService;
|
||||
private final KeywordLoaderService keywordLoaderService;
|
||||
private final DocumentLoaderService documentLoaderService;
|
||||
private final DomainCoordinator domainCoordinator;
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
@Inject
|
||||
@@ -71,7 +78,9 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
DomainProcessor domainProcessor,
|
||||
FileStorageService fileStorageService,
|
||||
KeywordLoaderService keywordLoaderService,
|
||||
DocumentLoaderService documentLoaderService, HikariDataSource dataSource)
|
||||
DocumentLoaderService documentLoaderService,
|
||||
DomainCoordinator domainCoordinator,
|
||||
HikariDataSource dataSource)
|
||||
throws Exception
|
||||
{
|
||||
super(messageQueueFactory, config, gson, LIVE_CRAWLER_INBOX);
|
||||
@@ -84,6 +93,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
this.fileStorageService = fileStorageService;
|
||||
this.keywordLoaderService = keywordLoaderService;
|
||||
this.documentLoaderService = documentLoaderService;
|
||||
this.domainCoordinator = domainCoordinator;
|
||||
this.dataSource = dataSource;
|
||||
|
||||
domainBlacklist.waitUntilLoaded();
|
||||
@@ -107,6 +117,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
try {
|
||||
Injector injector = Guice.createInjector(
|
||||
new LiveCrawlerModule(),
|
||||
new DomainCoordinationModule(), // 2 hours lease timeout is enough for the live crawler
|
||||
new ProcessConfigurationModule("crawler"),
|
||||
new ConverterModule(),
|
||||
new ServiceDiscoveryModule(),
|
||||
@@ -143,7 +154,10 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
}
|
||||
|
||||
private void run() throws Exception {
|
||||
Path basePath = fileStorageService.getStorageBase(FileStorageBaseType.STORAGE).asPath().resolve("live-crawl-data");
|
||||
Path basePath = fileStorageService
|
||||
.getStorageBase(FileStorageBaseType.STORAGE)
|
||||
.asPath()
|
||||
.resolve("live-crawl-data");
|
||||
|
||||
if (!Files.isDirectory(basePath)) {
|
||||
Files.createDirectories(basePath);
|
||||
@@ -158,21 +172,38 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
{
|
||||
final Instant cutoff = Instant.now().minus(60, ChronoUnit.DAYS);
|
||||
|
||||
/* ------------------------------------------------ */
|
||||
/* Fetch the latest domains from the feeds database */
|
||||
/* ------------------------------------------------ */
|
||||
|
||||
processHeartbeat.progress(LiveCrawlState.FETCH_LINKS);
|
||||
|
||||
Map<String, List<String>> urlsPerDomain = new HashMap<>(10_000);
|
||||
if (!feedsClient.waitReady(Duration.ofHours(1))) {
|
||||
throw new RuntimeException("Feeds client never became ready, cannot proceed with live crawling");
|
||||
}
|
||||
feedsClient.getUpdatedDomains(cutoff, urlsPerDomain::put);
|
||||
|
||||
logger.info("Fetched data for {} domains", urlsPerDomain.size());
|
||||
|
||||
|
||||
/* ------------------------------------- */
|
||||
/* Prune the database from old entries */
|
||||
/* ------------------------------------- */
|
||||
|
||||
processHeartbeat.progress(LiveCrawlState.PRUNE_DB);
|
||||
|
||||
// Remove data that is too old
|
||||
dataSet.prune(cutoff);
|
||||
|
||||
|
||||
/* ------------------------------------- */
|
||||
/* Fetch the links for each domain */
|
||||
/* ------------------------------------- */
|
||||
|
||||
processHeartbeat.progress(LiveCrawlState.CRAWLING);
|
||||
|
||||
try (SimpleLinkScraper fetcher = new SimpleLinkScraper(dataSet, domainQueries, domainBlacklist);
|
||||
CloseableHttpClient client = HttpClientProvider.createClient();
|
||||
try (SimpleLinkScraper fetcher = new SimpleLinkScraper(dataSet, domainCoordinator, domainQueries, client, domainBlacklist);
|
||||
var hb = heartbeat.createAdHocTaskHeartbeat("Live Crawling"))
|
||||
{
|
||||
for (Map.Entry<String, List<String>> entry : hb.wrap("Fetching", urlsPerDomain.entrySet())) {
|
||||
@@ -185,18 +216,29 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
fetcher.scheduleRetrieval(domain, urls);
|
||||
}
|
||||
}
|
||||
finally {
|
||||
client.close(CloseMode.GRACEFUL);
|
||||
}
|
||||
|
||||
Path tempPath = dataSet.createWorkDir();
|
||||
|
||||
|
||||
try {
|
||||
/* ------------------------------------- */
|
||||
/* Process the fetched links */
|
||||
/* ------------------------------------- */
|
||||
|
||||
processHeartbeat.progress(LiveCrawlState.PROCESSING);
|
||||
|
||||
try (var hb = heartbeat.createAdHocTaskHeartbeat("Processing");
|
||||
var writer = new ConverterBatchWriter(tempPath, 0)
|
||||
) {
|
||||
// Offset the documents' ordinals toward the upper range, to avoid an ID collisions with the
|
||||
// main indexes (the maximum permissible for doc ordinal is value is 67_108_863, so this
|
||||
// leaves us with a lot of headroom still)
|
||||
// We need unique document ids that do not collide with the document id from the main index,
|
||||
// so we offset the documents' ordinals toward the upper range.
|
||||
//
|
||||
// The maximum permissible for doc ordinal is value is 67_108_863,
|
||||
// so this leaves us with a lot of headroom still!
|
||||
// Expected document count here is order of 10 :^)
|
||||
writer.setOrdinalOffset(67_000_000);
|
||||
|
||||
for (SerializableCrawlDataStream stream : hb.wrap("Processing", dataSet.getDataStreams())) {
|
||||
@@ -204,10 +246,15 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* ---------------------------------------------- */
|
||||
/* Load the processed data into the link database */
|
||||
/* and construct an index journal for the docs */
|
||||
/* ---------------------------------------------- */
|
||||
|
||||
processHeartbeat.progress(LiveCrawlState.LOADING);
|
||||
|
||||
LoaderInputData lid = new LoaderInputData(tempPath, 1);
|
||||
|
||||
DomainIdRegistry domainIdRegistry = new DbDomainIdRegistry(dataSource);
|
||||
|
||||
keywordLoaderService.loadKeywords(domainIdRegistry, heartbeat, lid);
|
||||
@@ -219,9 +266,16 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
FileUtils.deleteDirectory(tempPath.toFile());
|
||||
}
|
||||
|
||||
// Construct the index
|
||||
|
||||
/* ------------------------------------- */
|
||||
/* Finish up */
|
||||
/* ------------------------------------- */
|
||||
|
||||
processHeartbeat.progress(LiveCrawlState.DONE);
|
||||
|
||||
// After we return from here, the LiveCrawlActor will trigger an index construction
|
||||
// job. Unlike all the stuff we did in this process, it's identical to the real job
|
||||
// so we don't need to do anything special from this process
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -5,8 +5,8 @@ import crawlercommons.robots.SimpleRobotRulesParser;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.contenttype.ContentType;
|
||||
import nu.marginalia.contenttype.DocumentBodyToString;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.logic.DomainLocks;
|
||||
import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.coordination.DomainLock;
|
||||
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.db.DomainBlacklist;
|
||||
@@ -14,24 +14,21 @@ import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.core5.http.ClassicHttpRequest;
|
||||
import org.apache.hc.core5.http.io.entity.EntityUtils;
|
||||
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpHeaders;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.ThreadLocalRandom;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
/** A simple link scraper that fetches URLs and stores them in a database,
|
||||
* with no concept of a crawl frontier, WARC output, or other advanced features
|
||||
@@ -44,18 +41,21 @@ public class SimpleLinkScraper implements AutoCloseable {
|
||||
private final LiveCrawlDataSet dataSet;
|
||||
private final DbDomainQueries domainQueries;
|
||||
private final DomainBlacklist domainBlacklist;
|
||||
private final Duration connectTimeout = Duration.ofSeconds(10);
|
||||
private final Duration readTimeout = Duration.ofSeconds(10);
|
||||
private final DomainLocks domainLocks = new DomainLocks();
|
||||
private final DomainCoordinator domainCoordinator;
|
||||
|
||||
private final static int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);
|
||||
private final HttpClient httpClient;
|
||||
|
||||
public SimpleLinkScraper(LiveCrawlDataSet dataSet,
|
||||
DomainCoordinator domainCoordinator,
|
||||
DbDomainQueries domainQueries,
|
||||
HttpClient httpClient,
|
||||
DomainBlacklist domainBlacklist) {
|
||||
this.dataSet = dataSet;
|
||||
this.domainCoordinator = domainCoordinator;
|
||||
this.domainQueries = domainQueries;
|
||||
this.domainBlacklist = domainBlacklist;
|
||||
this.httpClient = httpClient;
|
||||
}
|
||||
|
||||
public void scheduleRetrieval(EdgeDomain domain, List<String> urls) {
|
||||
@@ -72,17 +72,19 @@ public class SimpleLinkScraper implements AutoCloseable {
|
||||
|
||||
EdgeUrl rootUrl = domain.toRootUrlHttps();
|
||||
|
||||
List<EdgeUrl> relevantUrls = new ArrayList<>();
|
||||
List<EdgeUrl> relevantUrls = new ArrayList<>(Math.max(1, urls.size()));
|
||||
|
||||
// Resolve absolute URLs
|
||||
for (var url : urls) {
|
||||
Optional<EdgeUrl> optParsedUrl = lp.parseLink(rootUrl, url);
|
||||
if (optParsedUrl.isEmpty()) {
|
||||
|
||||
if (optParsedUrl.isEmpty())
|
||||
continue;
|
||||
}
|
||||
if (dataSet.hasUrl(optParsedUrl.get())) {
|
||||
continue;
|
||||
}
|
||||
relevantUrls.add(optParsedUrl.get());
|
||||
|
||||
EdgeUrl absoluteUrl = optParsedUrl.get();
|
||||
|
||||
if (!dataSet.hasUrl(absoluteUrl))
|
||||
relevantUrls.add(absoluteUrl);
|
||||
}
|
||||
|
||||
if (relevantUrls.isEmpty()) {
|
||||
@@ -91,16 +93,10 @@ public class SimpleLinkScraper implements AutoCloseable {
|
||||
|
||||
int fetched = 0;
|
||||
|
||||
try (HttpClient client = HttpClient
|
||||
.newBuilder()
|
||||
.connectTimeout(connectTimeout)
|
||||
.followRedirects(HttpClient.Redirect.NEVER)
|
||||
.version(HttpClient.Version.HTTP_2)
|
||||
.build();
|
||||
// throttle concurrent access per domain; IDE will complain it's not used, but it holds a semaphore -- do not remove:
|
||||
DomainLocks.DomainLock lock = domainLocks.lockDomain(domain)
|
||||
try (// throttle concurrent access per domain; IDE will complain it's not used, but it holds a semaphore -- do not remove:
|
||||
DomainLock lock = domainCoordinator.lockDomain(domain)
|
||||
) {
|
||||
SimpleRobotRules rules = fetchRobotsRules(rootUrl, client);
|
||||
SimpleRobotRules rules = fetchRobotsRules(rootUrl);
|
||||
|
||||
if (rules == null) { // I/O error fetching robots.txt
|
||||
// If we can't fetch the robots.txt,
|
||||
@@ -113,18 +109,19 @@ public class SimpleLinkScraper implements AutoCloseable {
|
||||
CrawlDelayTimer timer = new CrawlDelayTimer(rules.getCrawlDelay());
|
||||
|
||||
for (var parsedUrl : relevantUrls) {
|
||||
|
||||
if (!rules.isAllowed(parsedUrl.toString())) {
|
||||
maybeFlagAsBad(parsedUrl);
|
||||
continue;
|
||||
}
|
||||
|
||||
switch (fetchUrl(domainId, parsedUrl, timer, client)) {
|
||||
switch (fetchUrl(domainId, parsedUrl, timer)) {
|
||||
case FetchResult.Success(int id, EdgeUrl docUrl, String body, String headers) -> {
|
||||
dataSet.saveDocument(id, docUrl, body, headers, "");
|
||||
fetched++;
|
||||
}
|
||||
case FetchResult.Error(EdgeUrl docUrl) -> maybeFlagAsBad(docUrl);
|
||||
case FetchResult.Error(EdgeUrl docUrl) -> {
|
||||
maybeFlagAsBad(docUrl);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -147,111 +144,107 @@ public class SimpleLinkScraper implements AutoCloseable {
|
||||
}
|
||||
|
||||
@Nullable
|
||||
private SimpleRobotRules fetchRobotsRules(EdgeUrl rootUrl, HttpClient client) throws IOException, InterruptedException, URISyntaxException {
|
||||
var robotsRequest = HttpRequest.newBuilder(rootUrl.withPathAndParam("/robots.txt", null).asURI())
|
||||
.GET()
|
||||
.header("User-Agent", WmsaHome.getUserAgent().uaString())
|
||||
.header("Accept-Encoding","gzip")
|
||||
.timeout(readTimeout);
|
||||
|
||||
// Fetch the robots.txt
|
||||
private SimpleRobotRules fetchRobotsRules(EdgeUrl rootUrl) throws URISyntaxException {
|
||||
ClassicHttpRequest request = ClassicRequestBuilder.get(rootUrl.withPathAndParam("/robots.txt", null).asURI())
|
||||
.setHeader("User-Agent", WmsaHome.getUserAgent().uaString())
|
||||
.setHeader("Accept-Encoding", "gzip")
|
||||
.build();
|
||||
|
||||
try {
|
||||
SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
|
||||
HttpResponse<byte[]> robotsTxt = client.send(robotsRequest.build(), HttpResponse.BodyHandlers.ofByteArray());
|
||||
|
||||
if (robotsTxt.statusCode() == 200) {
|
||||
return parser.parseContent(rootUrl.toString(),
|
||||
getResponseData(robotsTxt),
|
||||
robotsTxt.headers().firstValue("Content-Type").orElse("text/plain"),
|
||||
WmsaHome.getUserAgent().uaIdentifier());
|
||||
return httpClient.execute(request, rsp -> {
|
||||
if (rsp.getEntity() == null) {
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
if (rsp.getCode() == 200) {
|
||||
var contentTypeHeader = rsp.getFirstHeader("Content-Type");
|
||||
if (contentTypeHeader == null) {
|
||||
return null; // No content type header, can't parse
|
||||
}
|
||||
return new SimpleRobotRulesParser().parseContent(
|
||||
rootUrl.toString(),
|
||||
EntityUtils.toByteArray(rsp.getEntity()),
|
||||
contentTypeHeader.getValue(),
|
||||
WmsaHome.getUserAgent().uaIdentifier()
|
||||
);
|
||||
} else if (rsp.getCode() == 404) {
|
||||
return new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL);
|
||||
}
|
||||
} finally {
|
||||
EntityUtils.consumeQuietly(rsp.getEntity());
|
||||
}
|
||||
return null;
|
||||
});
|
||||
}
|
||||
catch (IOException e) {
|
||||
logger.error("Error fetching robots.txt for {}: {}", rootUrl, e.getMessage());
|
||||
return null; // I/O error fetching robots.txt
|
||||
}
|
||||
finally {
|
||||
try {
|
||||
TimeUnit.SECONDS.sleep(1);
|
||||
}
|
||||
else if (robotsTxt.statusCode() == 404) {
|
||||
return new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL);
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.error("Error fetching robots.txt for {}: {} {}", rootUrl, ex.getClass().getSimpleName(), ex.getMessage());
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Fetch a URL and store it in the database
|
||||
*/
|
||||
private FetchResult fetchUrl(int domainId, EdgeUrl parsedUrl, CrawlDelayTimer timer, HttpClient client) throws Exception {
|
||||
private FetchResult fetchUrl(int domainId, EdgeUrl parsedUrl, CrawlDelayTimer timer) throws Exception {
|
||||
|
||||
timer.waitFetchDelay();
|
||||
|
||||
HttpRequest request = HttpRequest.newBuilder(parsedUrl.asURI())
|
||||
.GET()
|
||||
.header("User-Agent", WmsaHome.getUserAgent().uaString())
|
||||
.header("Accept", "text/html")
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.timeout(readTimeout)
|
||||
ClassicHttpRequest request = ClassicRequestBuilder.get(parsedUrl.asURI())
|
||||
.setHeader("User-Agent", WmsaHome.getUserAgent().uaString())
|
||||
.setHeader("Accept", "text/html")
|
||||
.setHeader("Accept-Encoding", "gzip")
|
||||
.build();
|
||||
|
||||
try {
|
||||
HttpResponse<byte[]> response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
|
||||
return httpClient.execute(request, rsp -> {
|
||||
try {
|
||||
if (rsp.getCode() == 200) {
|
||||
String contentType = rsp.getFirstHeader("Content-Type").getValue();
|
||||
if (!contentType.toLowerCase().startsWith("text/html")) {
|
||||
return new FetchResult.Error(parsedUrl);
|
||||
}
|
||||
|
||||
// Handle rate limiting by waiting and retrying once
|
||||
if (response.statusCode() == 429) {
|
||||
timer.waitRetryDelay(new HttpFetcherImpl.RateLimitException(
|
||||
response.headers().firstValue("Retry-After").orElse("5")
|
||||
));
|
||||
response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
|
||||
}
|
||||
byte[] body = EntityUtils.toByteArray(rsp.getEntity(), MAX_SIZE);
|
||||
|
||||
String contentType = response.headers().firstValue("Content-Type").orElse("").toLowerCase();
|
||||
String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), body);
|
||||
|
||||
if (response.statusCode() == 200) {
|
||||
if (!contentType.toLowerCase().startsWith("text/html")) {
|
||||
return new FetchResult.Error(parsedUrl);
|
||||
StringBuilder headersStr = new StringBuilder();
|
||||
for (var header : rsp.getHeaders()) {
|
||||
headersStr.append(header.getName()).append(": ").append(header.getValue()).append("\n");
|
||||
}
|
||||
|
||||
return new FetchResult.Success(domainId, parsedUrl, bodyText, headersStr.toString());
|
||||
}
|
||||
} finally {
|
||||
if (rsp.getEntity() != null) {
|
||||
EntityUtils.consumeQuietly(rsp.getEntity());
|
||||
}
|
||||
}
|
||||
|
||||
byte[] body = getResponseData(response);
|
||||
if (body.length > MAX_SIZE) {
|
||||
return new FetchResult.Error(parsedUrl);
|
||||
}
|
||||
|
||||
String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), body);
|
||||
|
||||
return new FetchResult.Success(domainId, parsedUrl, bodyText, headersToString(response.headers()));
|
||||
}
|
||||
return new FetchResult.Error(parsedUrl);
|
||||
});
|
||||
}
|
||||
catch (IOException ex) {
|
||||
// We don't want a full stack trace on every error, as it's quite common and very noisy
|
||||
logger.error("Error fetching URL {}: {} {}", parsedUrl, ex.getClass().getSimpleName(), ex.getMessage());
|
||||
catch (IOException e) {
|
||||
logger.error("Error fetching {}: {}", parsedUrl, e.getMessage());
|
||||
// If we can't fetch the URL, we return an error result
|
||||
// so that the caller can decide what to do with it.
|
||||
}
|
||||
finally {
|
||||
timer.waitFetchDelay();
|
||||
}
|
||||
|
||||
return new FetchResult.Error(parsedUrl);
|
||||
}
|
||||
|
||||
private byte[] getResponseData(HttpResponse<byte[]> response) throws IOException {
|
||||
String encoding = response.headers().firstValue("Content-Encoding").orElse("");
|
||||
|
||||
if ("gzip".equals(encoding)) {
|
||||
try (var stream = new GZIPInputStream(new ByteArrayInputStream(response.body()))) {
|
||||
return stream.readAllBytes();
|
||||
}
|
||||
}
|
||||
else {
|
||||
return response.body();
|
||||
}
|
||||
}
|
||||
|
||||
sealed interface FetchResult {
|
||||
record Success(int domainId, EdgeUrl url, String body, String headers) implements FetchResult {}
|
||||
record Error(EdgeUrl url) implements FetchResult {}
|
||||
}
|
||||
|
||||
private String headersToString(HttpHeaders headers) {
|
||||
StringBuilder headersStr = new StringBuilder();
|
||||
headers.map().forEach((k, v) -> {
|
||||
headersStr.append(k).append(": ").append(v).append("\n");
|
||||
});
|
||||
return headersStr.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws Exception {
|
||||
pool.shutDown();
|
||||
|
@@ -0,0 +1,126 @@
|
||||
package nu.marginalia.livecrawler.io;
|
||||
|
||||
import com.google.inject.Provider;
|
||||
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.config.ConnectionConfig;
|
||||
import org.apache.hc.client5.http.config.RequestConfig;
|
||||
import org.apache.hc.client5.http.cookie.StandardCookieSpec;
|
||||
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
|
||||
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager;
|
||||
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder;
|
||||
import org.apache.hc.core5.http.HeaderElement;
|
||||
import org.apache.hc.core5.http.HeaderElements;
|
||||
import org.apache.hc.core5.http.HttpResponse;
|
||||
import org.apache.hc.core5.http.io.SocketConfig;
|
||||
import org.apache.hc.core5.http.message.MessageSupport;
|
||||
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||
import org.apache.hc.core5.util.TimeValue;
|
||||
import org.apache.hc.core5.util.Timeout;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.security.KeyManagementException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.util.Iterator;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class HttpClientProvider implements Provider<HttpClient> {
|
||||
private static final HttpClient client;
|
||||
private static PoolingHttpClientConnectionManager connectionManager;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(HttpClientProvider.class);
|
||||
|
||||
static {
|
||||
try {
|
||||
client = createClient();
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public static CloseableHttpClient createClient() throws NoSuchAlgorithmException, KeyManagementException {
|
||||
final ConnectionConfig connectionConfig = ConnectionConfig.custom()
|
||||
.setSocketTimeout(15, TimeUnit.SECONDS)
|
||||
.setConnectTimeout(15, TimeUnit.SECONDS)
|
||||
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
|
||||
.build();
|
||||
|
||||
|
||||
connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
|
||||
.setMaxConnPerRoute(2)
|
||||
.setMaxConnTotal(50)
|
||||
.setDefaultConnectionConfig(connectionConfig)
|
||||
.build();
|
||||
|
||||
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
|
||||
.setSoLinger(TimeValue.ofSeconds(-1))
|
||||
.setSoTimeout(Timeout.ofSeconds(10))
|
||||
.build()
|
||||
);
|
||||
|
||||
Thread.ofPlatform().daemon(true).start(() -> {
|
||||
try {
|
||||
for (;;) {
|
||||
TimeUnit.SECONDS.sleep(15);
|
||||
logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
|
||||
}
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
});
|
||||
|
||||
final RequestConfig defaultRequestConfig = RequestConfig.custom()
|
||||
.setCookieSpec(StandardCookieSpec.IGNORE)
|
||||
.setResponseTimeout(10, TimeUnit.SECONDS)
|
||||
.setConnectionRequestTimeout(5, TimeUnit.MINUTES)
|
||||
.build();
|
||||
|
||||
return HttpClients.custom()
|
||||
.setConnectionManager(connectionManager)
|
||||
.setRetryStrategy(new RetryStrategy())
|
||||
.setKeepAliveStrategy(new ConnectionKeepAliveStrategy() {
|
||||
// Default keep-alive duration is 3 minutes, but this is too long for us,
|
||||
// as we are either going to re-use it fairly quickly or close it for a long time.
|
||||
//
|
||||
// So we set it to 30 seconds or clamp the server-provided value to a minimum of 10 seconds.
|
||||
private static final TimeValue defaultValue = TimeValue.ofSeconds(30);
|
||||
|
||||
@Override
|
||||
public TimeValue getKeepAliveDuration(HttpResponse response, HttpContext context) {
|
||||
final Iterator<HeaderElement> it = MessageSupport.iterate(response, HeaderElements.KEEP_ALIVE);
|
||||
|
||||
while (it.hasNext()) {
|
||||
final HeaderElement he = it.next();
|
||||
final String param = he.getName();
|
||||
final String value = he.getValue();
|
||||
|
||||
if (value == null)
|
||||
continue;
|
||||
if (!"timeout".equalsIgnoreCase(param))
|
||||
continue;
|
||||
|
||||
try {
|
||||
long timeout = Long.parseLong(value);
|
||||
timeout = Math.clamp(timeout, 30, defaultValue.toSeconds());
|
||||
return TimeValue.ofSeconds(timeout);
|
||||
} catch (final NumberFormatException ignore) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return defaultValue;
|
||||
}
|
||||
})
|
||||
.disableRedirectHandling()
|
||||
.setDefaultRequestConfig(defaultRequestConfig)
|
||||
.build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HttpClient get() {
|
||||
return client;
|
||||
}
|
||||
}
|
||||
|
@@ -0,0 +1,79 @@
|
||||
package nu.marginalia.livecrawler.io;
|
||||
|
||||
import org.apache.hc.client5.http.HttpHostConnectException;
|
||||
import org.apache.hc.client5.http.HttpRequestRetryStrategy;
|
||||
import org.apache.hc.core5.http.HttpRequest;
|
||||
import org.apache.hc.core5.http.HttpResponse;
|
||||
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||
import org.apache.hc.core5.util.TimeValue;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.net.ssl.SSLException;
|
||||
import java.io.IOException;
|
||||
import java.net.SocketException;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.net.UnknownHostException;
|
||||
|
||||
public class RetryStrategy implements HttpRequestRetryStrategy {
|
||||
private static final Logger logger = LoggerFactory.getLogger(RetryStrategy.class);
|
||||
|
||||
@Override
|
||||
public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
|
||||
return switch (exception) {
|
||||
case SocketTimeoutException ste -> false;
|
||||
case SSLException ssle -> false;
|
||||
case UnknownHostException uhe -> false;
|
||||
case HttpHostConnectException ex -> executionCount < 2;
|
||||
case SocketException ex -> executionCount < 2;
|
||||
default -> executionCount <= 3;
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean retryRequest(HttpResponse response, int executionCount, HttpContext context) {
|
||||
return switch (response.getCode()) {
|
||||
case 500, 503 -> executionCount <= 2;
|
||||
case 429 -> executionCount <= 3;
|
||||
default -> false;
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public TimeValue getRetryInterval(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
|
||||
return TimeValue.ofSeconds(1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TimeValue getRetryInterval(HttpResponse response, int executionCount, HttpContext context) {
|
||||
|
||||
int statusCode = response.getCode();
|
||||
|
||||
// Give 503 a bit more time
|
||||
if (statusCode == 503) return TimeValue.ofSeconds(5);
|
||||
|
||||
if (statusCode == 429) {
|
||||
// get the Retry-After header
|
||||
var retryAfterHeader = response.getFirstHeader("Retry-After");
|
||||
if (retryAfterHeader == null) {
|
||||
return TimeValue.ofSeconds(3);
|
||||
}
|
||||
|
||||
String retryAfter = retryAfterHeader.getValue();
|
||||
if (retryAfter == null) {
|
||||
return TimeValue.ofSeconds(2);
|
||||
}
|
||||
|
||||
try {
|
||||
int retryAfterTime = Integer.parseInt(retryAfter);
|
||||
retryAfterTime = Math.clamp(retryAfterTime, 1, 5);
|
||||
|
||||
return TimeValue.ofSeconds(retryAfterTime);
|
||||
} catch (NumberFormatException e) {
|
||||
logger.warn("Invalid Retry-After header: {}", retryAfter);
|
||||
}
|
||||
}
|
||||
|
||||
return TimeValue.ofSeconds(2);
|
||||
}
|
||||
}
|
@@ -1,11 +1,15 @@
|
||||
package nu.marginalia.livecrawler;
|
||||
|
||||
import nu.marginalia.coordination.LocalDomainCoordinator;
|
||||
import nu.marginalia.db.DomainBlacklistImpl;
|
||||
import nu.marginalia.io.SerializableCrawlDataStream;
|
||||
import nu.marginalia.livecrawler.io.HttpClientProvider;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
|
||||
import org.apache.hc.core5.io.CloseMode;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
@@ -15,29 +19,34 @@ import org.mockito.Mockito;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.security.KeyManagementException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.List;
|
||||
|
||||
class SimpleLinkScraperTest {
|
||||
private Path tempDir;
|
||||
private LiveCrawlDataSet dataSet;
|
||||
private CloseableHttpClient httpClient;
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws IOException, SQLException {
|
||||
public void setUp() throws IOException, SQLException, NoSuchAlgorithmException, KeyManagementException {
|
||||
tempDir = Files.createTempDirectory(getClass().getSimpleName());
|
||||
dataSet = new LiveCrawlDataSet(tempDir);
|
||||
httpClient = HttpClientProvider.createClient();
|
||||
}
|
||||
|
||||
|
||||
@AfterEach
|
||||
public void tearDown() throws Exception {
|
||||
dataSet.close();
|
||||
httpClient.close(CloseMode.IMMEDIATE);
|
||||
FileUtils.deleteDirectory(tempDir.toFile());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRetrieveNow() throws Exception {
|
||||
var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class));
|
||||
var scraper = new SimpleLinkScraper(dataSet, new LocalDomainCoordinator(), null, httpClient, Mockito.mock(DomainBlacklistImpl.class));
|
||||
int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
|
||||
Assertions.assertEquals(1, fetched);
|
||||
|
||||
@@ -57,7 +66,7 @@ class SimpleLinkScraperTest {
|
||||
@Test
|
||||
public void testRetrieveNow_Redundant() throws Exception {
|
||||
dataSet.saveDocument(1, new EdgeUrl("https://www.marginalia.nu/"), "<html>", "", "127.0.0.1");
|
||||
var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class));
|
||||
var scraper = new SimpleLinkScraper(dataSet, new LocalDomainCoordinator(),null, httpClient, Mockito.mock(DomainBlacklistImpl.class));
|
||||
|
||||
// If the requested URL is already in the dataSet, we retrieveNow should shortcircuit and not fetch anything
|
||||
int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
|
||||
|
12
code/processes/new-domain-process/README.md
Normal file
12
code/processes/new-domain-process/README.md
Normal file
@@ -0,0 +1,12 @@
|
||||
The new domain process (NDP) is a process that evaluates new domains for
|
||||
inclusion in the search engine index.
|
||||
|
||||
It visits the root document of each candidate domain, ensures that it's reachable,
|
||||
verifies that the response is valid HTML, and checks for a few factors such as length
|
||||
and links before deciding whether to assign the domain to a node.
|
||||
|
||||
The NDP process will assign new domains to the node with the fewest assigned domains.
|
||||
|
||||
The NDP process is triggered with a goal target number of domains to process, and
|
||||
will find domains until that target is reached. If e.g. a goal of 100 is set,
|
||||
and 50 are in the index, it will find 50 more domains.
|
75
code/processes/new-domain-process/build.gradle
Normal file
75
code/processes/new-domain-process/build.gradle
Normal file
@@ -0,0 +1,75 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
|
||||
application {
|
||||
mainClass = 'nu.marginalia.ping.PingMain'
|
||||
applicationName = 'ping-process'
|
||||
}
|
||||
|
||||
tasks.distZip.enabled = false
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
|
||||
implementation project(':code:common:db')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
||||
|
||||
implementation project(':code:libraries:domain-lock')
|
||||
implementation project(':code:libraries:geo-ip')
|
||||
implementation project(':code:libraries:message-queue')
|
||||
implementation project(':code:libraries:blocking-thread-pool')
|
||||
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
|
||||
implementation project(':code:processes:process-mq-api')
|
||||
implementation project(':code:processes:crawling-process:ft-content-type')
|
||||
implementation project(':code:processes:crawling-process:ft-link-parser')
|
||||
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.notnull
|
||||
implementation libs.guava
|
||||
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
implementation libs.gson
|
||||
implementation libs.zstd
|
||||
implementation libs.bucket4j
|
||||
implementation libs.crawlercommons
|
||||
implementation libs.jsoup
|
||||
implementation libs.fastutil
|
||||
implementation libs.bundles.curator
|
||||
implementation libs.bundles.mariadb
|
||||
implementation libs.bundles.httpcomponents
|
||||
implementation libs.commons.lang3
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
|
||||
testImplementation libs.wiremock
|
||||
|
||||
|
||||
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
|
||||
testImplementation libs.commons.codec
|
||||
testImplementation 'org.testcontainers:mariadb:1.17.4'
|
||||
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
|
||||
testImplementation project(':code:libraries:test-helpers')
|
||||
|
||||
testImplementation project(':code:processes:test-data')
|
||||
}
|
||||
|
@@ -0,0 +1,146 @@
|
||||
package nu.marginalia.ndp;
|
||||
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.contenttype.ContentType;
|
||||
import nu.marginalia.contenttype.DocumentBodyToString;
|
||||
import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.ndp.io.HttpClientProvider;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.core5.http.io.entity.EntityUtils;
|
||||
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.security.KeyManagementException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.time.Duration;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/** Evaluates a domain to determine if it is worth indexing.
|
||||
* This class fetches the root document, checks the response code, content type,
|
||||
* and parses the HTML to ensure it smells alright.
|
||||
*/
|
||||
@Singleton
|
||||
public class DomainEvaluator {
|
||||
private final HttpClient client;
|
||||
private final String userAgentString = WmsaHome.getUserAgent().uaString();
|
||||
|
||||
private final LinkParser linkParser = new LinkParser();
|
||||
private final DomainCoordinator domainCoordinator;
|
||||
|
||||
@Inject
|
||||
public DomainEvaluator(DomainCoordinator domainCoordinator) throws NoSuchAlgorithmException, KeyManagementException {
|
||||
this.domainCoordinator = domainCoordinator;
|
||||
client = HttpClientProvider.createClient();
|
||||
}
|
||||
|
||||
public boolean evaluateDomain(String domainName) {
|
||||
var edgeDomain = new EdgeDomain(domainName);
|
||||
|
||||
// Grab a lock on the domain to prevent concurrent evaluations between processes
|
||||
try (var lock = domainCoordinator.lockDomain(edgeDomain)) {
|
||||
var rootUrl = edgeDomain.toRootUrlHttps();
|
||||
|
||||
var request = ClassicRequestBuilder.get(rootUrl.asURI())
|
||||
.addHeader("User-Agent", userAgentString)
|
||||
.addHeader("Accept-Encoding", "gzip")
|
||||
.addHeader("Accept", "text/html,application/xhtml+xml;q=0.9")
|
||||
.build();
|
||||
|
||||
return client.execute(request, (rsp) -> {
|
||||
if (rsp.getEntity() == null)
|
||||
return false;
|
||||
|
||||
try {
|
||||
// Check if the response code indicates a successful fetch
|
||||
if (200 != rsp.getCode()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
byte[] content;
|
||||
// Read the content from the response entity
|
||||
try (InputStream contentStream = rsp.getEntity().getContent()) {
|
||||
content = contentStream.readNBytes(8192);
|
||||
}
|
||||
|
||||
// Parse the content (if it's valid)
|
||||
ContentType contentType = ContentType.parse(rsp.getEntity().getContentType());
|
||||
|
||||
// Validate the content type
|
||||
if (!contentType.contentType().startsWith("text/html") && !contentType.contentType().startsWith("application/xhtml+xml"))
|
||||
return false;
|
||||
|
||||
// Parse the document body to a Jsoup Document
|
||||
final Document document = Jsoup.parse(DocumentBodyToString.getStringData(contentType, content));
|
||||
final String text = document.body().text();
|
||||
|
||||
if (text.length() < 100)
|
||||
return false;
|
||||
if (text.contains("404 Not Found") || text.contains("Page not found"))
|
||||
return false;
|
||||
if (hasMetaRefresh(document))
|
||||
return false; // This almost always indicates a parked domain
|
||||
if (!hasInternalLink(document, edgeDomain, rootUrl))
|
||||
return false; // No internal links means it's not worth indexing
|
||||
|
||||
return true;
|
||||
}
|
||||
catch (Exception e) {
|
||||
return false;
|
||||
}
|
||||
finally {
|
||||
// May or may not be necessary, but let's ensure we clean up the response entity
|
||||
// to avoid resource leaks
|
||||
EntityUtils.consumeQuietly(rsp.getEntity());
|
||||
|
||||
// Sleep for a while before yielding the lock, to avoid immediately hammering the domain
|
||||
// from another process
|
||||
sleepQuietly(Duration.ofSeconds(1));
|
||||
}
|
||||
});
|
||||
}
|
||||
catch (Exception ex) {
|
||||
return false; // If we fail to fetch or parse the domain, we consider it invalid
|
||||
}
|
||||
}
|
||||
|
||||
private boolean hasInternalLink(Document document, EdgeDomain currentDomain, EdgeUrl rootUrl) {
|
||||
for (Element atag : document.select("a")) {
|
||||
Optional<EdgeDomain> destDomain = linkParser
|
||||
.parseLink(rootUrl, atag)
|
||||
.map(EdgeUrl::getDomain);
|
||||
|
||||
if (destDomain.isPresent() && Objects.equals(currentDomain, destDomain.get()))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean hasMetaRefresh(Document document) {
|
||||
for (Element metaTag : document.select("meta")) {
|
||||
if ("refresh".equalsIgnoreCase(metaTag.attr("http-equiv")))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private void sleepQuietly(Duration duration) {
|
||||
try {
|
||||
TimeUnit.MILLISECONDS.sleep(duration.toMillis());
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
@@ -0,0 +1,134 @@
|
||||
package nu.marginalia.ndp;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.nodecfg.NodeConfigurationService;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.PriorityQueue;
|
||||
import java.util.Set;
|
||||
|
||||
/** DomainAllocator is responsible for assigning domains to partitions/nodes.
|
||||
* This is ensured to make sure that domains are evenly distributed across the nodes.
|
||||
*/
|
||||
public class DomainNodeAllocator {
|
||||
|
||||
private final NodeConfigurationService nodeConfigurationService;
|
||||
private final HikariDataSource dataSource;
|
||||
private final PriorityQueue<NodeCount> countPerNode = new PriorityQueue<>();
|
||||
|
||||
private volatile boolean initialized = false;
|
||||
|
||||
private record NodeCount(int nodeId, int count)
|
||||
implements Comparable<NodeCount>
|
||||
{
|
||||
public NodeCount incrementCount() {
|
||||
return new NodeCount(nodeId, count + 1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(@NotNull DomainNodeAllocator.NodeCount o) {
|
||||
return Integer.compare(this.count, o.count);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Inject
|
||||
public DomainNodeAllocator(NodeConfigurationService nodeConfigurationService, HikariDataSource dataSource) {
|
||||
this.nodeConfigurationService = nodeConfigurationService;
|
||||
this.dataSource = dataSource;
|
||||
|
||||
Thread.ofPlatform()
|
||||
.name("DomainNodeAllocator::initialize()")
|
||||
.start(this::initialize);
|
||||
}
|
||||
|
||||
public synchronized int totalCount() {
|
||||
ensureInitialized();
|
||||
return countPerNode.stream().mapToInt(NodeCount::count).sum();
|
||||
}
|
||||
|
||||
/** Returns the next node ID to assign a domain to.
|
||||
* This method is synchronized to ensure thread safety when multiple threads are allocating domains.
|
||||
* The node ID returned is guaranteed to be one of the viable nodes configured in the system.
|
||||
*/
|
||||
public synchronized int nextNodeId() {
|
||||
ensureInitialized();
|
||||
|
||||
// Synchronized is fine here as this is not a hot path
|
||||
// (and PriorityBlockingQueue won't help since we're re-adding the same element with a new count all the time)
|
||||
|
||||
NodeCount allocation = countPerNode.remove();
|
||||
countPerNode.add(allocation.incrementCount());
|
||||
return allocation.nodeId();
|
||||
}
|
||||
|
||||
|
||||
private void ensureInitialized() {
|
||||
if (initialized) return;
|
||||
|
||||
synchronized (this) {
|
||||
while (!initialized) {
|
||||
try {
|
||||
// Wait until the initialization is complete
|
||||
this.wait(1000);
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
throw new RuntimeException("DomainAllocator initialization interrupted", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void initialize() {
|
||||
if (initialized) return;
|
||||
|
||||
Set<Integer> viableNodes = new HashSet<>();
|
||||
|
||||
// Find all viable nodes that can handle batch crawls
|
||||
for (var node : nodeConfigurationService.getAll()) {
|
||||
if (node.disabled())
|
||||
continue;
|
||||
if (!node.autoAssignDomains())
|
||||
continue;
|
||||
|
||||
if (node.profile().permitBatchCrawl())
|
||||
viableNodes.add(node.node());
|
||||
}
|
||||
|
||||
// Fetch the current counts of domains per node from the database
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT COUNT(*) AS CNT, NODE_AFFINITY
|
||||
FROM EC_DOMAIN
|
||||
WHERE NODE_AFFINITY>0
|
||||
GROUP BY NODE_AFFINITY
|
||||
"""))
|
||||
{
|
||||
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
|
||||
int nodeId = rs.getInt("NODE_AFFINITY");
|
||||
int count = rs.getInt("CNT");
|
||||
|
||||
if (viableNodes.remove(nodeId)) {
|
||||
countPerNode.add(new NodeCount(nodeId, count));
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Failed to load domain counts from database", e);
|
||||
}
|
||||
|
||||
// Add any remaining viable nodes that were not found in the database
|
||||
for (int nodeId : viableNodes) {
|
||||
countPerNode.add(new NodeCount(nodeId, 0));
|
||||
}
|
||||
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
|
||||
}
|
@@ -0,0 +1,240 @@
|
||||
package nu.marginalia.ndp;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import it.unimi.dsi.fastutil.ints.Int2IntMap;
|
||||
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
|
||||
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
|
||||
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
|
||||
import nu.marginalia.ndp.model.DomainToTest;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.Connection;
|
||||
import java.sql.ResultSet;
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
public class DomainTestingQueue {
|
||||
private static Logger logger = LoggerFactory.getLogger(DomainTestingQueue.class);
|
||||
|
||||
private final ArrayBlockingQueue<DomainToTest> queue = new ArrayBlockingQueue<>(2);
|
||||
|
||||
// This will grow quite large, but should be manageable in memory, as theoretical maximum is around 100M domains,
|
||||
// order of 2 GB in memory.
|
||||
private final ConcurrentHashMap<String, Boolean> takenDomains = new ConcurrentHashMap<>();
|
||||
|
||||
private final HikariDataSource dataSource;
|
||||
private final AggregateLinkGraphClient linkGraphClient;
|
||||
|
||||
|
||||
@Inject
|
||||
public DomainTestingQueue(HikariDataSource dataSource,
|
||||
AggregateLinkGraphClient linkGraphClient
|
||||
) {
|
||||
this.dataSource = dataSource;
|
||||
this.linkGraphClient = linkGraphClient;
|
||||
|
||||
Thread.ofPlatform()
|
||||
.name("DomainTestingQueue::fetch()")
|
||||
.start(this::fetch);
|
||||
}
|
||||
|
||||
public DomainToTest next() throws InterruptedException {
|
||||
return queue.take();
|
||||
}
|
||||
|
||||
public void accept(DomainToTest domain, int nodeId) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var flagOkStmt = conn.prepareStatement("""
|
||||
UPDATE NDP_NEW_DOMAINS
|
||||
SET STATE='ACCEPTED'
|
||||
WHERE DOMAIN_ID=?
|
||||
""");
|
||||
var assignNodeStmt = conn.prepareStatement("""
|
||||
UPDATE EC_DOMAIN SET NODE_AFFINITY=?
|
||||
WHERE ID=?
|
||||
AND EC_DOMAIN.NODE_AFFINITY < 0
|
||||
""")
|
||||
)
|
||||
{
|
||||
conn.setAutoCommit(false);
|
||||
flagOkStmt.setInt(1, domain.domainId());
|
||||
flagOkStmt.executeUpdate();
|
||||
|
||||
assignNodeStmt.setInt(1, nodeId);
|
||||
assignNodeStmt.setInt(2, domain.domainId());
|
||||
assignNodeStmt.executeUpdate();
|
||||
conn.commit();
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Failed to accept domain in database", e);
|
||||
}
|
||||
}
|
||||
|
||||
public void reject(DomainToTest domain) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
UPDATE NDP_NEW_DOMAINS
|
||||
SET STATE='REJECTED', CHECK_COUNT=CHECK_COUNT + 1
|
||||
WHERE DOMAIN_ID=?
|
||||
"""))
|
||||
{
|
||||
conn.setAutoCommit(false);
|
||||
stmt.setInt(1, domain.domainId());
|
||||
stmt.executeUpdate();
|
||||
conn.commit();
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Failed to reject domain in database", e);
|
||||
}
|
||||
}
|
||||
|
||||
public void fetch() {
|
||||
while (true) {
|
||||
List<DomainToTest> domains = new ArrayList<>(2000);
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT DOMAIN_ID, DOMAIN_NAME
|
||||
FROM NDP_NEW_DOMAINS
|
||||
INNER JOIN EC_DOMAIN ON ID=DOMAIN_ID
|
||||
WHERE NDP_NEW_DOMAINS.STATE = 'NEW'
|
||||
ORDER BY PRIORITY DESC
|
||||
LIMIT 2000
|
||||
"""))
|
||||
{
|
||||
var rs = stmt.executeQuery();
|
||||
|
||||
while (rs.next()) {
|
||||
int domainId = rs.getInt("DOMAIN_ID");
|
||||
String domainName = rs.getString("DOMAIN_NAME");
|
||||
if (takenDomains.put(domainName, true) != null) {
|
||||
logger.warn("Domain {} is already processed, skipping", domainName);
|
||||
continue; // Skip if already taken
|
||||
}
|
||||
domains.add(new DomainToTest(domainName, domainId));
|
||||
}
|
||||
|
||||
if (domains.isEmpty()) {
|
||||
if (!refreshQueue(conn)) {
|
||||
throw new RuntimeException("No new domains found, aborting!");
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (RuntimeException e) {
|
||||
throw e; // Rethrow runtime exceptions to avoid wrapping them in another runtime exception
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException("Failed to fetch domains from database", e);
|
||||
}
|
||||
|
||||
try {
|
||||
for (var domain : domains) {
|
||||
queue.put(domain);
|
||||
}
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
throw new RuntimeException("Domain fetching interrupted", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean refreshQueue(Connection conn) {
|
||||
logger.info("Refreshing domain queue in database");
|
||||
|
||||
Int2IntMap domainIdToCount = new Int2IntOpenHashMap();
|
||||
|
||||
// Load known domain IDs from the database to avoid inserting duplicates from NDP_NEW_DOMAINS
|
||||
// or domains that are already assigned to a node
|
||||
{
|
||||
IntOpenHashSet knownIds = new IntOpenHashSet();
|
||||
|
||||
try (var stmt = conn.createStatement()) {
|
||||
ResultSet rs = stmt.executeQuery("SELECT DOMAIN_ID FROM NDP_NEW_DOMAINS");
|
||||
rs.setFetchSize(10_000);
|
||||
while (rs.next()) {
|
||||
int domainId = rs.getInt("DOMAIN_ID");
|
||||
knownIds.add(domainId);
|
||||
}
|
||||
|
||||
rs = stmt.executeQuery("SELECT ID FROM EC_DOMAIN WHERE NODE_AFFINITY>=0");
|
||||
rs.setFetchSize(10_000);
|
||||
while (rs.next()) {
|
||||
int domainId = rs.getInt("ID");
|
||||
knownIds.add(domainId);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Failed to load known domain IDs from database", e);
|
||||
}
|
||||
|
||||
// Ensure the link graph is ready before proceeding. This is mainly necessary in a cold reboot
|
||||
// of the entire system.
|
||||
try {
|
||||
logger.info("Waiting for link graph client to be ready...");
|
||||
linkGraphClient.waitReady(Duration.ofHours(1));
|
||||
logger.info("Link graph client is ready, fetching domain links...");
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
// Fetch all domain links from the link graph and count by how many sources each dest domain is linked from
|
||||
var iter = linkGraphClient.getAllDomainLinks().iterator();
|
||||
while (iter.advance()) {
|
||||
int dest = iter.dest();
|
||||
if (!knownIds.contains(dest)) {
|
||||
domainIdToCount.mergeInt(dest, 1, (i, j) -> i + j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
boolean didInsert = false;
|
||||
|
||||
/* Insert new domains into NDP_NEW_DOMAINS table */
|
||||
try (var insertStmt = conn.prepareStatement("""
|
||||
INSERT IGNORE INTO NDP_NEW_DOMAINS (DOMAIN_ID, PRIORITY) VALUES (?, ?)
|
||||
""")) {
|
||||
conn.setAutoCommit(false);
|
||||
|
||||
int cnt = 0;
|
||||
for (var entry : domainIdToCount.int2IntEntrySet()) {
|
||||
int domainId = entry.getIntKey();
|
||||
int count = entry.getIntValue();
|
||||
|
||||
insertStmt.setInt(1, domainId);
|
||||
insertStmt.setInt(2, count);
|
||||
insertStmt.addBatch();
|
||||
|
||||
if (++cnt >= 1000) {
|
||||
cnt = 0;
|
||||
insertStmt.executeBatch(); // Execute in batches to avoid memory issues
|
||||
conn.commit();
|
||||
didInsert = true;
|
||||
}
|
||||
}
|
||||
if (cnt != 0) {
|
||||
insertStmt.executeBatch(); // Execute any remaining batch
|
||||
conn.commit();
|
||||
didInsert = true;
|
||||
}
|
||||
|
||||
logger.info("Queue refreshed successfully");
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Failed to refresh queue in database", e);
|
||||
}
|
||||
|
||||
// Clean up NDP_NEW_DOMAINS table to remove any domains that are already in EC_DOMAIN
|
||||
// This acts not only to clean up domains that we've flagged as ACCEPTED, but also to
|
||||
// repair inconsistent states where domains might have incorrectly been added to NDP_NEW_DOMAINS
|
||||
try (var stmt = conn.createStatement()) {
|
||||
stmt.executeUpdate("DELETE FROM NDP_NEW_DOMAINS WHERE DOMAIN_ID IN (SELECT ID FROM EC_DOMAIN WHERE NODE_AFFINITY>=0)");
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException("Failed to clean up NDP_NEW_DOMAINS", e);
|
||||
}
|
||||
|
||||
return didInsert;
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,159 @@
|
||||
package nu.marginalia.ndp;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Injector;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.coordination.DomainCoordinationModule;
|
||||
import nu.marginalia.db.DomainBlacklist;
|
||||
import nu.marginalia.geoip.GeoIpDictionary;
|
||||
import nu.marginalia.mq.MessageQueueFactory;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.mqapi.ndp.NdpRequest;
|
||||
import nu.marginalia.ndp.model.DomainToTest;
|
||||
import nu.marginalia.process.ProcessConfiguration;
|
||||
import nu.marginalia.process.ProcessConfigurationModule;
|
||||
import nu.marginalia.process.ProcessMainClass;
|
||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||
import nu.marginalia.service.module.DatabaseModule;
|
||||
import nu.marginalia.service.module.ServiceDiscoveryModule;
|
||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.security.Security;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
public class NdpMain extends ProcessMainClass {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(NdpMain.class);
|
||||
private final DomainNodeAllocator domainNodeAllocator;
|
||||
private final DomainTestingQueue domainTestingQueue;
|
||||
private final ProcessHeartbeat processHeartbeat;
|
||||
private final DomainEvaluator domainEvaluator;
|
||||
private final DomainBlacklist domainBlacklist;
|
||||
|
||||
private final AtomicInteger domainCount = new AtomicInteger(0);
|
||||
|
||||
@Inject
|
||||
public NdpMain(MessageQueueFactory messageQueueFactory,
|
||||
ProcessConfiguration config,
|
||||
DomainNodeAllocator domainNodeAllocator,
|
||||
DomainTestingQueue domainTestingQueue,
|
||||
DomainEvaluator domainEvaluator,
|
||||
DomainBlacklist domainBlacklist,
|
||||
ProcessHeartbeat processHeartbeat,
|
||||
Gson gson)
|
||||
{
|
||||
super(messageQueueFactory, config, gson, ProcessInboxNames.NDP_INBOX);
|
||||
|
||||
this.domainNodeAllocator = domainNodeAllocator;
|
||||
this.domainEvaluator = domainEvaluator;
|
||||
this.domainBlacklist = domainBlacklist;
|
||||
this.domainTestingQueue = domainTestingQueue;
|
||||
this.processHeartbeat = processHeartbeat;
|
||||
}
|
||||
|
||||
|
||||
public void run(int goalCount) throws InterruptedException {
|
||||
logger.info("Wait for blacklist to load...");
|
||||
domainBlacklist.waitUntilLoaded();
|
||||
|
||||
SimpleBlockingThreadPool threadPool = new SimpleBlockingThreadPool(
|
||||
"NDP-Worker",
|
||||
8,
|
||||
10,
|
||||
SimpleBlockingThreadPool.ThreadType.PLATFORM
|
||||
);
|
||||
|
||||
logger.info("Starting NDP process");
|
||||
|
||||
int toInsertCount = goalCount - domainNodeAllocator.totalCount();
|
||||
|
||||
if (toInsertCount <= 0) {
|
||||
logger.info("No new domains to process. Current count: " + domainNodeAllocator.totalCount());
|
||||
return;
|
||||
}
|
||||
|
||||
try (var hb = processHeartbeat.createAdHocTaskHeartbeat("Growing Index")) {
|
||||
int cnt;
|
||||
while ((cnt = domainCount.get()) < toInsertCount) {
|
||||
if (cnt % 100 == 0) {
|
||||
hb.progress("Discovery Process", cnt, toInsertCount);
|
||||
}
|
||||
|
||||
final DomainToTest nextDomain = domainTestingQueue.next();
|
||||
threadPool.submit(() -> {
|
||||
try {
|
||||
if (domainEvaluator.evaluateDomain(nextDomain.domainName())) {
|
||||
logger.info("Accepting: {}", nextDomain.domainName());
|
||||
domainCount.incrementAndGet();
|
||||
domainTestingQueue.accept(nextDomain, domainNodeAllocator.nextNodeId());
|
||||
} else {
|
||||
logger.info("Rejecting: {}", nextDomain.domainName());
|
||||
domainTestingQueue.reject(nextDomain);
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
domainTestingQueue.reject(nextDomain);
|
||||
logger.error("Error evaluating domain: " + nextDomain.domainId(), e);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
threadPool.shutDown();
|
||||
// Wait for all tasks to complete or give up after 1 hour
|
||||
threadPool.awaitTermination(1, TimeUnit.HOURS);
|
||||
|
||||
logger.info("NDP process completed. Total domains processed: " + domainCount.get());
|
||||
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
// Prevent Java from caching DNS lookups forever (filling up the system RAM as a result)
|
||||
Security.setProperty("networkaddress.cache.ttl" , "3600");
|
||||
|
||||
// This must run *early*
|
||||
System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
|
||||
|
||||
// If these aren't set properly, the JVM will hang forever on some requests
|
||||
System.setProperty("sun.net.client.defaultConnectTimeout", "30000");
|
||||
System.setProperty("sun.net.client.defaultReadTimeout", "30000");
|
||||
|
||||
// Set the maximum number of connections to keep alive in the connection pool
|
||||
System.setProperty("jdk.httpclient.idleTimeout", "15"); // 15 seconds
|
||||
System.setProperty("jdk.httpclient.connectionPoolSize", "256");
|
||||
|
||||
// We don't want to use too much memory caching sessions for https
|
||||
System.setProperty("javax.net.ssl.sessionCacheSize", "2048");
|
||||
|
||||
|
||||
Injector injector = Guice.createInjector(
|
||||
new NdpModule(),
|
||||
new ServiceDiscoveryModule(),
|
||||
new DomainCoordinationModule(),
|
||||
new ProcessConfigurationModule("ndp"),
|
||||
new DatabaseModule(false)
|
||||
);
|
||||
|
||||
GeoIpDictionary geoIpDictionary = injector.getInstance(GeoIpDictionary.class);
|
||||
|
||||
geoIpDictionary.waitReady(); // Ensure the GeoIpDictionary is ready before proceeding
|
||||
|
||||
NdpMain main = injector.getInstance(NdpMain.class);
|
||||
|
||||
var instructions = main.fetchInstructions(NdpRequest.class);
|
||||
|
||||
try {
|
||||
main.run(instructions.value().goal());
|
||||
instructions.ok();
|
||||
}
|
||||
catch (Throwable ex) {
|
||||
logger.error("Error running ping process", ex);
|
||||
instructions.err();
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,8 @@
|
||||
package nu.marginalia.ndp;
|
||||
|
||||
import com.google.inject.AbstractModule;
|
||||
|
||||
public class NdpModule extends AbstractModule {
|
||||
public void configure() {
|
||||
}
|
||||
}
|
@@ -0,0 +1,126 @@
|
||||
package nu.marginalia.ndp.io;
|
||||
|
||||
import com.google.inject.Provider;
|
||||
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.config.ConnectionConfig;
|
||||
import org.apache.hc.client5.http.config.RequestConfig;
|
||||
import org.apache.hc.client5.http.cookie.StandardCookieSpec;
|
||||
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
|
||||
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager;
|
||||
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder;
|
||||
import org.apache.hc.core5.http.HeaderElement;
|
||||
import org.apache.hc.core5.http.HeaderElements;
|
||||
import org.apache.hc.core5.http.HttpResponse;
|
||||
import org.apache.hc.core5.http.io.SocketConfig;
|
||||
import org.apache.hc.core5.http.message.MessageSupport;
|
||||
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||
import org.apache.hc.core5.util.TimeValue;
|
||||
import org.apache.hc.core5.util.Timeout;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.security.KeyManagementException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.util.Iterator;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class HttpClientProvider implements Provider<HttpClient> {
|
||||
private static final HttpClient client;
|
||||
private static PoolingHttpClientConnectionManager connectionManager;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(HttpClientProvider.class);
|
||||
|
||||
static {
|
||||
try {
|
||||
client = createClient();
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public static CloseableHttpClient createClient() throws NoSuchAlgorithmException, KeyManagementException {
|
||||
final ConnectionConfig connectionConfig = ConnectionConfig.custom()
|
||||
.setSocketTimeout(15, TimeUnit.SECONDS)
|
||||
.setConnectTimeout(15, TimeUnit.SECONDS)
|
||||
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
|
||||
.build();
|
||||
|
||||
|
||||
connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
|
||||
.setMaxConnPerRoute(2)
|
||||
.setMaxConnTotal(50)
|
||||
.setDefaultConnectionConfig(connectionConfig)
|
||||
.build();
|
||||
|
||||
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
|
||||
.setSoLinger(TimeValue.ofSeconds(-1))
|
||||
.setSoTimeout(Timeout.ofSeconds(10))
|
||||
.build()
|
||||
);
|
||||
|
||||
Thread.ofPlatform().daemon(true).start(() -> {
|
||||
try {
|
||||
for (;;) {
|
||||
TimeUnit.SECONDS.sleep(15);
|
||||
logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
|
||||
}
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
});
|
||||
|
||||
final RequestConfig defaultRequestConfig = RequestConfig.custom()
|
||||
.setCookieSpec(StandardCookieSpec.IGNORE)
|
||||
.setResponseTimeout(10, TimeUnit.SECONDS)
|
||||
.setConnectionRequestTimeout(5, TimeUnit.MINUTES)
|
||||
.build();
|
||||
|
||||
return HttpClients.custom()
|
||||
.setConnectionManager(connectionManager)
|
||||
.setRetryStrategy(new RetryStrategy())
|
||||
.setKeepAliveStrategy(new ConnectionKeepAliveStrategy() {
|
||||
// Default keep-alive duration is 3 minutes, but this is too long for us,
|
||||
// as we are either going to re-use it fairly quickly or close it for a long time.
|
||||
//
|
||||
// So we set it to 30 seconds or clamp the server-provided value to a minimum of 10 seconds.
|
||||
private static final TimeValue defaultValue = TimeValue.ofSeconds(30);
|
||||
|
||||
@Override
|
||||
public TimeValue getKeepAliveDuration(HttpResponse response, HttpContext context) {
|
||||
final Iterator<HeaderElement> it = MessageSupport.iterate(response, HeaderElements.KEEP_ALIVE);
|
||||
|
||||
while (it.hasNext()) {
|
||||
final HeaderElement he = it.next();
|
||||
final String param = he.getName();
|
||||
final String value = he.getValue();
|
||||
|
||||
if (value == null)
|
||||
continue;
|
||||
if (!"timeout".equalsIgnoreCase(param))
|
||||
continue;
|
||||
|
||||
try {
|
||||
long timeout = Long.parseLong(value);
|
||||
timeout = Math.clamp(timeout, 30, defaultValue.toSeconds());
|
||||
return TimeValue.ofSeconds(timeout);
|
||||
} catch (final NumberFormatException ignore) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return defaultValue;
|
||||
}
|
||||
})
|
||||
.disableRedirectHandling()
|
||||
.setDefaultRequestConfig(defaultRequestConfig)
|
||||
.build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HttpClient get() {
|
||||
return client;
|
||||
}
|
||||
}
|
||||
|
@@ -0,0 +1,79 @@
|
||||
package nu.marginalia.ndp.io;
|
||||
|
||||
import org.apache.hc.client5.http.HttpHostConnectException;
|
||||
import org.apache.hc.client5.http.HttpRequestRetryStrategy;
|
||||
import org.apache.hc.core5.http.HttpRequest;
|
||||
import org.apache.hc.core5.http.HttpResponse;
|
||||
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||
import org.apache.hc.core5.util.TimeValue;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.net.ssl.SSLException;
|
||||
import java.io.IOException;
|
||||
import java.net.SocketException;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.net.UnknownHostException;
|
||||
|
||||
public class RetryStrategy implements HttpRequestRetryStrategy {
|
||||
private static final Logger logger = LoggerFactory.getLogger(RetryStrategy.class);
|
||||
|
||||
@Override
|
||||
public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
|
||||
return switch (exception) {
|
||||
case SocketTimeoutException ste -> false;
|
||||
case SSLException ssle -> false;
|
||||
case UnknownHostException uhe -> false;
|
||||
case HttpHostConnectException ex -> executionCount < 2;
|
||||
case SocketException ex -> executionCount < 2;
|
||||
default -> executionCount <= 3;
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean retryRequest(HttpResponse response, int executionCount, HttpContext context) {
|
||||
return switch (response.getCode()) {
|
||||
case 500, 503 -> executionCount <= 2;
|
||||
case 429 -> executionCount <= 3;
|
||||
default -> false;
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public TimeValue getRetryInterval(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
|
||||
return TimeValue.ofSeconds(1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TimeValue getRetryInterval(HttpResponse response, int executionCount, HttpContext context) {
|
||||
|
||||
int statusCode = response.getCode();
|
||||
|
||||
// Give 503 a bit more time
|
||||
if (statusCode == 503) return TimeValue.ofSeconds(5);
|
||||
|
||||
if (statusCode == 429) {
|
||||
// get the Retry-After header
|
||||
var retryAfterHeader = response.getFirstHeader("Retry-After");
|
||||
if (retryAfterHeader == null) {
|
||||
return TimeValue.ofSeconds(3);
|
||||
}
|
||||
|
||||
String retryAfter = retryAfterHeader.getValue();
|
||||
if (retryAfter == null) {
|
||||
return TimeValue.ofSeconds(2);
|
||||
}
|
||||
|
||||
try {
|
||||
int retryAfterTime = Integer.parseInt(retryAfter);
|
||||
retryAfterTime = Math.clamp(retryAfterTime, 1, 5);
|
||||
|
||||
return TimeValue.ofSeconds(retryAfterTime);
|
||||
} catch (NumberFormatException e) {
|
||||
logger.warn("Invalid Retry-After header: {}", retryAfter);
|
||||
}
|
||||
}
|
||||
|
||||
return TimeValue.ofSeconds(2);
|
||||
}
|
||||
}
|
@@ -0,0 +1,4 @@
|
||||
package nu.marginalia.ndp.model;

/**
 * A domain queued for evaluation.
 *
 * @param domainName the domain name to test
 * @param domainId   numeric identifier for the domain — presumably the database id; confirm against callers
 */
public record DomainToTest(String domainName, int domainId) {
}
|
@@ -0,0 +1,29 @@
|
||||
package nu.marginalia.ndp;
|
||||
|
||||
import nu.marginalia.coordination.LocalDomainCoordinator;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.security.KeyManagementException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
/** Live-network smoke test for DomainEvaluator.  Results depend on external
 * servers and DNS state, hence the "flaky" tag excluding it from CI. */
class DomainEvaluatorTest {

    @Tag("flaky") // Exclude from CI runs due to potential network issues
    @Test
    public void testSunnyDay() throws NoSuchAlgorithmException, KeyManagementException {
        DomainEvaluator evaluator = new DomainEvaluator(new LocalDomainCoordinator());

        // Should be a valid domain
        assertTrue(evaluator.evaluateDomain("www.marginalia.nu"));

        // Should be a redirect to www.marginalia.nu
        assertFalse(evaluator.evaluateDomain("memex.marginalia.nu"));

        // Should fail on Anubis
        assertFalse(evaluator.evaluateDomain("marginalia-search.com"));
    }
}
|
12
code/processes/ping-process/README.md
Normal file
12
code/processes/ping-process/README.md
Normal file
@@ -0,0 +1,12 @@
|
||||
The ping process (which has nothing to do with ICMP ping) keeps track of
the availability of websites.  It also gathers fingerprint information about
the security posture of each website, as well as DNS information.

This data is kept to build an idea of when a website is down, and to identify
ownership changes, as well as other significant events in the lifecycle
of a website.

# Central Classes

* [PingMain](java/nu/marginalia/ping/PingMain.java) main class.
* [PingJobScheduler](java/nu/marginalia/ping/PingJobScheduler.java) service that dispatches pings.
|
72
code/processes/ping-process/build.gradle
Normal file
72
code/processes/ping-process/build.gradle
Normal file
@@ -0,0 +1,72 @@
|
||||
// Build script for the ping process, a standalone application
// (main class nu.marginalia.ping.PingMain).
plugins {
    id 'java'

    id 'application'
    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
    }
}

application {
    mainClass = 'nu.marginalia.ping.PingMain'
    applicationName = 'ping-process'
}

// Only the tar distribution is used; skip building the zip.
tasks.distZip.enabled = false

apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {

    // Shared project modules
    implementation project(':code:common:db')
    implementation project(':code:common:model')
    implementation project(':code:common:config')
    implementation project(':code:common:service')

    implementation project(':code:libraries:domain-lock')
    implementation project(':code:libraries:geo-ip')
    implementation project(':code:libraries:message-queue')

    implementation project(':code:processes:process-mq-api')
    implementation libs.bundles.slf4j
    implementation libs.notnull
    implementation libs.guava

    // Guice, excluding its bundled Guava in favor of the version above
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
    implementation libs.gson
    implementation libs.zstd
    implementation libs.bucket4j
    implementation libs.crawlercommons
    implementation libs.jsoup
    implementation libs.fastutil
    implementation libs.bundles.curator
    implementation libs.bundles.mariadb
    implementation libs.bundles.httpcomponents
    implementation libs.commons.lang3

    // Certificate/TLS fingerprinting and DNS lookups
    implementation 'org.bouncycastle:bcprov-jdk18on:1.80'
    implementation 'org.bouncycastle:bcpkix-jdk18on:1.80'
    implementation 'dnsjava:dnsjava:3.5.2'

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito

    testImplementation libs.wiremock


    // MariaDB testcontainers for DAO tests
    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
    testImplementation libs.commons.codec
    testImplementation 'org.testcontainers:mariadb:1.17.4'
    testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
    testImplementation project(':code:libraries:test-helpers')

    testImplementation project(':code:processes:test-data')
}
|
||||
|
@@ -0,0 +1,84 @@
|
||||
package nu.marginalia.ping;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.ping.model.ErrorClassification;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ThreadLocalRandom;
|
||||
|
||||
public class BackoffStrategy {
|
||||
|
||||
private final Map<ErrorClassification, Duration> baseIntervals;
|
||||
private final Map<ErrorClassification, Duration> maxIntervals;
|
||||
private final Duration okInterval;
|
||||
|
||||
@Inject
|
||||
public BackoffStrategy(PingIntervalsConfiguration pingIntervalsConfiguration) {
|
||||
this.baseIntervals = pingIntervalsConfiguration.baseIntervals();
|
||||
this.maxIntervals = pingIntervalsConfiguration.maxIntervals();
|
||||
this.okInterval = baseIntervals.get(ErrorClassification.NONE);
|
||||
}
|
||||
|
||||
public Duration getOkInterval() {
|
||||
return okInterval;
|
||||
}
|
||||
|
||||
public Duration getUpdateTime(Duration currentDuration,
|
||||
ErrorClassification errorClassification,
|
||||
int backoffConsecutiveFailures) {
|
||||
|
||||
Duration nextBackoff = calculateBackoff(errorClassification, currentDuration, backoffConsecutiveFailures + 1);
|
||||
nextBackoff = addJitter(nextBackoff);
|
||||
|
||||
return nextBackoff;
|
||||
}
|
||||
|
||||
private Duration calculateBackoff(ErrorClassification errorClassification,
|
||||
Duration currentDuration,
|
||||
int backoffConsecutiveFailures) {
|
||||
|
||||
if (currentDuration == null) {
|
||||
return baseIntervals.get(errorClassification);
|
||||
}
|
||||
|
||||
Duration baseInterval = baseIntervals.get(errorClassification);
|
||||
Duration maxInterval = maxIntervals.get(errorClassification);
|
||||
|
||||
if (currentDuration.compareTo(maxInterval) >= 0) {
|
||||
return maxInterval;
|
||||
}
|
||||
|
||||
double multiplier = switch(errorClassification) {
|
||||
case ErrorClassification.UNKNOWN -> 1.5;
|
||||
case ErrorClassification.TIMEOUT -> 2.5;
|
||||
case ErrorClassification.CONNECTION_ERROR -> 2.0;
|
||||
case ErrorClassification.HTTP_CLIENT_ERROR -> 1.7;
|
||||
case ErrorClassification.HTTP_SERVER_ERROR -> 2.0;
|
||||
case ErrorClassification.SSL_ERROR -> 1.8;
|
||||
case ErrorClassification.DNS_ERROR -> 1.5;
|
||||
default -> 2.0; // Default multiplier for any other classification
|
||||
};
|
||||
|
||||
double backoffMinutes = baseInterval.toMinutes()
|
||||
* Math.pow(multiplier, backoffConsecutiveFailures - 1);
|
||||
|
||||
Duration newDuration = Duration.ofMinutes(Math.round(0.5+backoffMinutes));
|
||||
if (newDuration.compareTo(maxInterval) > 0) {
|
||||
return maxInterval;
|
||||
}
|
||||
|
||||
return newDuration;
|
||||
}
|
||||
|
||||
private Duration addJitter(Duration duration) {
|
||||
// Add ±15% jitter to prevent synchronized retries
|
||||
double jitterPercent = 0.15;
|
||||
long baseMinutes = duration.toMinutes();
|
||||
long jitterRange = (long) (baseMinutes * jitterPercent * 2);
|
||||
long jitterOffset = ThreadLocalRandom.current().nextLong(jitterRange + 1) - (jitterRange / 2);
|
||||
|
||||
long finalMinutes = Math.max(1, baseMinutes + jitterOffset);
|
||||
return Duration.ofMinutes(finalMinutes);
|
||||
}
|
||||
}
|
259
code/processes/ping-process/java/nu/marginalia/ping/PingDao.java
Normal file
259
code/processes/ping-process/java/nu/marginalia/ping/PingDao.java
Normal file
@@ -0,0 +1,259 @@
|
||||
package nu.marginalia.ping;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.model.gson.GsonFactory;
|
||||
import nu.marginalia.ping.model.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.time.Instant;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
@Singleton
public class PingDao {
    private final HikariDataSource dataSource;
    // NOTE(review): gson appears unused within this class — confirm before removing.
    private static final Gson gson = GsonFactory.get();
    private static final Logger logger = LoggerFactory.getLogger(PingDao.class);

    @Inject
    public PingDao(HikariDataSource dataSource) {
        this.dataSource = dataSource;
    }

    /** Persist a single model. */
    public void write(WritableModel model) {
        write(List.of(model));
    }

    /** Persist a batch of models.  A batch of two or more is written in a
     * single transaction; a single model is written without one. */
    public void write(Collection<WritableModel> models) {
        logger.debug("Writing: {}", models);

        try (var conn = dataSource.getConnection()) {

            // Don't bother with a transaction if there's only one model to write.
            if (models.size() <= 1) {
                for (WritableModel model : models) {
                    model.write(conn);
                }
            }
            else { // If there are multiple models, use a transaction to ensure atomicity.
                conn.setAutoCommit(false);
                try {
                    for (WritableModel model : models) {
                        model.write(conn);
                    }
                    conn.commit();
                } catch (SQLException e) {
                    conn.rollback();
                    throw e;
                } finally {
                    // Restore the connection before it returns to the pool.
                    conn.setAutoCommit(true);
                }
            }
        } catch (SQLException e) {
            throw new RuntimeException("Failed to write model", e);
        }
    }

    /** Set the next DNS check time and priority for a root domain. */
    public void scheduleDnsUpdate(String rootDomainName, Instant timestamp, int priority) throws SQLException {
        try (var conn = dataSource.getConnection();
             var ps = conn.prepareStatement("""
                     UPDATE DOMAIN_DNS_INFORMATION
                     SET TS_NEXT_DNS_CHECK = ?, DNS_CHECK_PRIORITY = ?
                     WHERE ROOT_DOMAIN_NAME = ?
                     """)) {

            ps.setTimestamp(1, java.sql.Timestamp.from(timestamp));
            ps.setInt(2, priority);
            ps.setString(3, rootDomainName);
            ps.executeUpdate();
        }
    }

    /** Fetch availability information for a domain, or null if none is recorded. */
    public DomainAvailabilityRecord getDomainPingStatus(int domainId) throws SQLException {

        try (var conn = dataSource.getConnection();
             var ps = conn.prepareStatement("SELECT * FROM DOMAIN_AVAILABILITY_INFORMATION WHERE domain_id = ?")) {

            ps.setInt(1, domainId);
            ResultSet rs = ps.executeQuery();
            if (rs.next()) {
                return new DomainAvailabilityRecord(rs);
            } else {
                return null; // or throw an exception if preferred
            }
        }

    }

    /** Fetch security information for a domain, or null if none is recorded. */
    public DomainSecurityRecord getDomainSecurityInformation(int domainId) throws SQLException {
        try (var conn = dataSource.getConnection();
             var ps = conn.prepareStatement("SELECT * FROM DOMAIN_SECURITY_INFORMATION WHERE domain_id = ?")) {

            ps.setInt(1, domainId);
            ResultSet rs = ps.executeQuery();
            if (rs.next()) {
                return new DomainSecurityRecord(rs);
            } else {
                return null; // or throw an exception if preferred
            }
        }
    }

    /** Fetch the DNS record for a root domain by its id, or null if absent. */
    public DomainDnsRecord getDomainDnsRecord(long dnsRootDomainId) throws SQLException {
        try (var conn = dataSource.getConnection();
             var ps = conn.prepareStatement("SELECT * FROM DOMAIN_DNS_INFORMATION WHERE DNS_ROOT_DOMAIN_ID = ?")) {

            ps.setObject(1, dnsRootDomainId, java.sql.Types.INTEGER);
            ResultSet rs = ps.executeQuery();
            if (rs.next()) {
                return new DomainDnsRecord(rs);
            } else {
                return null; // or throw an exception if preferred
            }
        }
    }

    /** Fetch the DNS record for a root domain by name, or null if absent. */
    public DomainDnsRecord getDomainDnsRecord(String rootDomainName) throws SQLException {
        try (var conn = dataSource.getConnection();
             var ps = conn.prepareStatement("SELECT * FROM DOMAIN_DNS_INFORMATION WHERE ROOT_DOMAIN_NAME = ?")) {

            ps.setString(1, rootDomainName);
            ResultSet rs = ps.executeQuery();
            if (rs.next()) {
                return new DomainDnsRecord(rs);
            } else {
                return null; // or throw an exception if preferred
            }
        }
    }

    /** Fetch the historical availability + security data for a domain.
     * Returns the most specific variant available: both availability and
     * security, availability only, or a bare domain reference when neither
     * exists.  Returns null if the domain id is unknown. */
    public HistoricalAvailabilityData getHistoricalAvailabilityData(long domainId) throws SQLException {
        var query = """
                SELECT EC_DOMAIN.ID, EC_DOMAIN.DOMAIN_NAME, EC_DOMAIN.NODE_AFFINITY, DOMAIN_AVAILABILITY_INFORMATION.*, DOMAIN_SECURITY_INFORMATION.*
                FROM EC_DOMAIN
                LEFT JOIN DOMAIN_SECURITY_INFORMATION ON DOMAIN_SECURITY_INFORMATION.DOMAIN_ID = EC_DOMAIN.ID
                LEFT JOIN DOMAIN_AVAILABILITY_INFORMATION ON DOMAIN_AVAILABILITY_INFORMATION.DOMAIN_ID = EC_DOMAIN.ID
                WHERE EC_DOMAIN.ID = ?
                """;
        try (var conn = dataSource.getConnection();
             var ps = conn.prepareStatement(query)) {

            ps.setLong(1, domainId);

            ResultSet rs = ps.executeQuery();
            // At most one row is expected; the loop returns on the first row.
            while (rs.next()) {
                String domainName = rs.getString("EC_DOMAIN.DOMAIN_NAME");

                DomainAvailabilityRecord dar;
                DomainSecurityRecord dsr;

                // LEFT JOINs may produce all-null joined columns; probe the join key
                // to decide whether each record is actually present.
                if (rs.getObject("DOMAIN_SECURITY_INFORMATION.DOMAIN_ID", Integer.class) != null)
                    dsr = new DomainSecurityRecord(rs);
                else
                    dsr = null;

                if (rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.DOMAIN_ID", Integer.class) != null)
                    dar = new DomainAvailabilityRecord(rs);
                else
                    dar = null;

                if (dar == null) {
                    return new HistoricalAvailabilityData.JustDomainReference(new DomainReference(
                            rs.getInt("EC_DOMAIN.ID"),
                            rs.getInt("EC_DOMAIN.NODE_AFFINITY"),
                            domainName.toLowerCase()
                    ));
                }
                else {
                    if (dsr != null) {
                        return new HistoricalAvailabilityData.AvailabilityAndSecurity(domainName, dar, dsr);
                    } else {
                        return new HistoricalAvailabilityData.JustAvailability(domainName, dar);
                    }
                }
            }
        }

        return null;
    }

    /** Build the availability update schedule for all domains assigned to a node.
     * Domains with no recorded next-update time are scheduled immediately. */
    public List<UpdateSchedule.UpdateJob<DomainReference, HistoricalAvailabilityData>> getDomainUpdateSchedule(int nodeId) {
        List<UpdateSchedule.UpdateJob<DomainReference, HistoricalAvailabilityData>> updateJobs = new ArrayList<>();

        try (var conn = dataSource.getConnection();
             var ps = conn.prepareStatement("""
                     SELECT ID, DOMAIN_NAME, NEXT_SCHEDULED_UPDATE
                     FROM EC_DOMAIN
                     LEFT JOIN DOMAIN_AVAILABILITY_INFORMATION
                     ON EC_DOMAIN.ID = DOMAIN_AVAILABILITY_INFORMATION.DOMAIN_ID
                     WHERE NODE_AFFINITY = ?
                     """)) {
            // Large result set; stream it rather than buffering everything.
            ps.setFetchSize(10_000);
            ps.setInt(1, nodeId);
            ResultSet rs = ps.executeQuery();
            while (rs.next()) {
                int domainId = rs.getInt("ID");
                String domainName = rs.getString("DOMAIN_NAME");
                var ts = rs.getTimestamp("NEXT_SCHEDULED_UPDATE");
                Instant nextUpdate = ts == null ? Instant.now() : ts.toInstant();

                var ref = new DomainReference(domainId, nodeId, domainName.toLowerCase());
                updateJobs.add(new UpdateSchedule.UpdateJob<>(ref, nextUpdate));
            }
        } catch (SQLException e) {
            throw new RuntimeException("Failed to retrieve domain update schedule", e);
        }

        logger.info("Found {} availability update jobs for node {}", updateJobs.size(), nodeId);

        return updateJobs;
    }

    /** Build the DNS update schedule for all root domains with pages assigned to
     * a node.  Root domains lacking a DNS record are referenced by name and
     * scheduled immediately; known ones keep their stored next-update time. */
    public List<UpdateSchedule.UpdateJob<RootDomainReference, RootDomainReference>> getDnsUpdateSchedule(int nodeId) {
        List<UpdateSchedule.UpdateJob<RootDomainReference, RootDomainReference>> updateJobs = new ArrayList<>();

        try (var conn = dataSource.getConnection();
             var ps = conn.prepareStatement("""
                     SELECT DISTINCT(DOMAIN_TOP),DOMAIN_DNS_INFORMATION.* FROM EC_DOMAIN
                     LEFT JOIN DOMAIN_DNS_INFORMATION ON ROOT_DOMAIN_NAME = DOMAIN_TOP
                     WHERE EC_DOMAIN.NODE_AFFINITY = ?
                     """)) {
            // Large result set; stream it rather than buffering everything.
            ps.setFetchSize(10_000);
            ps.setInt(1, nodeId);
            ResultSet rs = ps.executeQuery();
            while (rs.next()) {
                Long dnsRootDomainId = rs.getObject("DOMAIN_DNS_INFORMATION.DNS_ROOT_DOMAIN_ID", Long.class);
                String rootDomainName = rs.getString("DOMAIN_TOP");

                if (dnsRootDomainId == null) {
                    updateJobs.add(
                            new UpdateSchedule.UpdateJob<>(
                                    new RootDomainReference.ByName(rootDomainName),
                                    Instant.now())
                    );
                }
                else {
                    var record = new DomainDnsRecord(rs);
                    updateJobs.add(new UpdateSchedule.UpdateJob<>(
                            new RootDomainReference.ByIdAndName(dnsRootDomainId, rootDomainName),
                            Objects.requireNonNullElseGet(record.tsNextScheduledUpdate(), Instant::now))
                    );
                }
            }
        } catch (SQLException e) {
            throw new RuntimeException("Failed to retrieve DNS update schedule", e);
        }

        logger.info("Found {} dns update jobs for node {}", updateJobs.size(), nodeId);

        return updateJobs;
    }
}
|
@@ -0,0 +1,13 @@
|
||||
package nu.marginalia.ping;
|
||||
|
||||
import nu.marginalia.ping.model.ErrorClassification;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Scheduling configuration for the ping process.
 *
 * @param dnsUpdateInterval interval between DNS record refreshes
 * @param baseIntervals     starting check interval per error classification (used by BackoffStrategy)
 * @param maxIntervals      backoff ceiling per error classification (used by BackoffStrategy)
 */
public record PingIntervalsConfiguration(
        Duration dnsUpdateInterval,
        Map<ErrorClassification, Duration> baseIntervals,
        Map<ErrorClassification, Duration> maxIntervals
) {
}
|
@@ -0,0 +1,297 @@
|
||||
package nu.marginalia.ping;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.ping.model.*;
|
||||
import nu.marginalia.ping.svc.DnsPingService;
|
||||
import nu.marginalia.ping.svc.HttpPingService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/** PingJobScheduler is responsible for scheduling and processing ping jobs
|
||||
* for both HTTP pings and DNS lookups. It manages a queue of jobs and processes them
|
||||
* in separate threads, ensuring that domains are pinged and DNS records are updated
|
||||
* efficiently.
|
||||
*/
|
||||
public class PingJobScheduler {
|
||||
private final HttpPingService httpPingService;
|
||||
private final DnsPingService dnsPingService;
|
||||
private final DomainCoordinator domainCoordinator;
|
||||
private final PingDao pingDao;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(PingJobScheduler.class);
|
||||
|
||||
private static final UpdateSchedule<RootDomainReference, RootDomainReference> dnsUpdateSchedule
|
||||
= new UpdateSchedule<>(250_000);
|
||||
private static final UpdateSchedule<DomainReference, HistoricalAvailabilityData> availabilityUpdateSchedule
|
||||
= new UpdateSchedule<>(250_000);
|
||||
|
||||
public volatile Instant dnsLastSync = Instant.now();
|
||||
public volatile Instant availabilityLastSync = Instant.now();
|
||||
|
||||
public volatile Integer nodeId = null;
|
||||
public volatile boolean running = false;
|
||||
|
||||
private final List<Thread> allThreads = new ArrayList<>();
|
||||
|
||||
@Inject
|
||||
public PingJobScheduler(HttpPingService httpPingService,
|
||||
DnsPingService dnsPingService,
|
||||
DomainCoordinator domainCoordinator,
|
||||
PingDao pingDao)
|
||||
{
|
||||
this.httpPingService = httpPingService;
|
||||
this.dnsPingService = dnsPingService;
|
||||
this.domainCoordinator = domainCoordinator;
|
||||
this.pingDao = pingDao;
|
||||
}
|
||||
|
||||
public synchronized void start() {
|
||||
if (running)
|
||||
return;
|
||||
|
||||
nodeId = null;
|
||||
|
||||
running = true;
|
||||
|
||||
allThreads.add(Thread.ofPlatform().daemon().name("sync-dns").start(this::syncAvailabilityJobs));
|
||||
allThreads.add(Thread.ofPlatform().daemon().name("sync-availability").start(this::syncDnsRecords));
|
||||
|
||||
int availabilityThreads = Integer.getInteger("ping.availabilityThreads", 8);
|
||||
int pingThreads = Integer.getInteger("ping.dnsThreads", 2);
|
||||
|
||||
for (int i = 0; i < availabilityThreads; i++) {
|
||||
allThreads.add(Thread.ofPlatform().daemon().name("availability-job-consumer-" + i).start(this::availabilityJobConsumer));
|
||||
}
|
||||
for (int i = 0; i < pingThreads; i++) {
|
||||
allThreads.add(Thread.ofPlatform().daemon().name("dns-job-consumer-" + i).start(this::dnsJobConsumer));
|
||||
}
|
||||
}
|
||||
|
||||
public void stop() {
|
||||
running = false;
|
||||
for (Thread thread : allThreads) {
|
||||
try {
|
||||
thread.interrupt();
|
||||
thread.join();
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
logger.error("Failed to join thread: " + thread.getName(), e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void pause(int nodeId) {
|
||||
logger.info("Pausing PingJobScheduler for nodeId: {}", nodeId);
|
||||
|
||||
if (this.nodeId != null && this.nodeId != nodeId) {
|
||||
logger.warn("Attempted to pause PingJobScheduler with mismatched nodeId: expected {}, got {}", this.nodeId, nodeId);
|
||||
return;
|
||||
}
|
||||
this.nodeId = null;
|
||||
|
||||
availabilityUpdateSchedule.clear();
|
||||
dnsUpdateSchedule.clear();
|
||||
|
||||
logger.info("PingJobScheduler paused");
|
||||
}
|
||||
|
||||
public synchronized void enableForNode(int nodeId) {
|
||||
logger.info("Resuming PingJobScheduler for nodeId: {}", nodeId);
|
||||
if (this.nodeId != null) {
|
||||
logger.warn("Attempted to resume PingJobScheduler with mismatched nodeId: expected {}, got {}", this.nodeId, nodeId);
|
||||
return;
|
||||
}
|
||||
|
||||
availabilityUpdateSchedule.replaceQueue(pingDao.getDomainUpdateSchedule(nodeId));
|
||||
dnsUpdateSchedule.replaceQueue(pingDao.getDnsUpdateSchedule(nodeId));
|
||||
dnsLastSync = Instant.now();
|
||||
availabilityLastSync = Instant.now();
|
||||
|
||||
// Flag that we are running again
|
||||
this.nodeId = nodeId;
|
||||
|
||||
notifyAll();
|
||||
logger.info("PingJobScheduler resumed");
|
||||
}
|
||||
|
||||
public synchronized void waitForResume() throws InterruptedException {
|
||||
while (nodeId == null) {
|
||||
wait();
|
||||
}
|
||||
}
|
||||
|
||||
private void availabilityJobConsumer() {
|
||||
while (running) {
|
||||
try {
|
||||
Integer nid = nodeId;
|
||||
if (nid == null) {
|
||||
waitForResume();
|
||||
continue;
|
||||
}
|
||||
|
||||
DomainReference ref = availabilityUpdateSchedule.nextIf(domain -> {
|
||||
EdgeDomain domainObj = new EdgeDomain(domain.domainName());
|
||||
if (!domainCoordinator.isLockableHint(domainObj)) {
|
||||
return false; // Skip locked domains
|
||||
}
|
||||
return true; // Process this domain
|
||||
});
|
||||
|
||||
long nextId = ref.domainId();
|
||||
var data = pingDao.getHistoricalAvailabilityData(nextId);
|
||||
if (data == null) {
|
||||
logger.warn("No availability data found for ID: {}", nextId);
|
||||
continue; // No data to process, skip this iteration
|
||||
}
|
||||
|
||||
try {
|
||||
List<WritableModel> objects = switch (data) {
|
||||
case HistoricalAvailabilityData.JustDomainReference(DomainReference reference)
|
||||
-> httpPingService.pingDomain(reference, null, null);
|
||||
case HistoricalAvailabilityData.JustAvailability(String domain, DomainAvailabilityRecord record)
|
||||
-> httpPingService.pingDomain(
|
||||
new DomainReference(record.domainId(), record.nodeId(), domain), record, null);
|
||||
case HistoricalAvailabilityData.AvailabilityAndSecurity(String domain, DomainAvailabilityRecord availability, DomainSecurityRecord security)
|
||||
-> httpPingService.pingDomain(
|
||||
new DomainReference(availability.domainId(), availability.nodeId(), domain), availability, security);
|
||||
};
|
||||
|
||||
pingDao.write(objects);
|
||||
|
||||
// Re-schedule the next update time for the domain
|
||||
for (var object : objects) {
|
||||
var ts = object.nextUpdateTime();
|
||||
if (ts != null) {
|
||||
availabilityUpdateSchedule.add(ref, ts);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Error processing availability job for domain: " + data.domain(), e);
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
logger.error("Availability job consumer interrupted", e);
|
||||
break;
|
||||
} catch (Exception e) {
|
||||
logger.error("Error processing availability job", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void dnsJobConsumer() {
|
||||
while (running) {
|
||||
try {
|
||||
Integer nid = nodeId;
|
||||
if (nid == null) {
|
||||
waitForResume();
|
||||
continue;
|
||||
}
|
||||
|
||||
RootDomainReference ref = dnsUpdateSchedule.next();
|
||||
|
||||
try {
|
||||
List<WritableModel> objects = switch(ref) {
|
||||
case RootDomainReference.ByIdAndName(long id, String name) -> {
|
||||
var oldRecord = Objects.requireNonNull(pingDao.getDomainDnsRecord(id));
|
||||
yield dnsPingService.pingDomain(oldRecord.rootDomainName(), oldRecord);
|
||||
}
|
||||
case RootDomainReference.ByName(String name) -> {
|
||||
@Nullable var oldRecord = pingDao.getDomainDnsRecord(name);
|
||||
yield dnsPingService.pingDomain(name, oldRecord);
|
||||
}
|
||||
};
|
||||
|
||||
pingDao.write(objects);
|
||||
|
||||
// Re-schedule the next update time for the domain
|
||||
for (var object : objects) {
|
||||
var ts = object.nextUpdateTime();
|
||||
if (ts != null) {
|
||||
dnsUpdateSchedule.add(ref, ts);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Error processing DNS job for domain: " + ref, e);
|
||||
}
|
||||
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
logger.error("DNS job consumer interrupted", e);
|
||||
break;
|
||||
} catch (Exception e) {
|
||||
logger.error("Error processing DNS job", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void syncAvailabilityJobs() {
|
||||
try {
|
||||
while (running) {
|
||||
|
||||
// If we are suspended, wait for resume
|
||||
Integer nid = nodeId;
|
||||
if (nid == null) {
|
||||
waitForResume();
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if we need to refresh the availability data
|
||||
Instant nextRefresh = availabilityLastSync.plus(Duration.ofHours(24));
|
||||
if (Instant.now().isBefore(nextRefresh)) {
|
||||
Duration remaining = Duration.between(Instant.now(), nextRefresh);
|
||||
TimeUnit.MINUTES.sleep(Math.max(1, remaining.toMinutes()));
|
||||
continue;
|
||||
}
|
||||
|
||||
availabilityUpdateSchedule.replaceQueue(pingDao.getDomainUpdateSchedule(nid));
|
||||
availabilityLastSync = Instant.now();
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Error fetching new ping jobs", e);
|
||||
}
|
||||
}
|
||||
|
||||
private void syncDnsRecords() {
|
||||
try {
|
||||
while (running) {
|
||||
|
||||
Integer nid = nodeId;
|
||||
if (nid == null) {
|
||||
waitForResume();
|
||||
continue; // re-fetch the records after resuming
|
||||
}
|
||||
|
||||
// Check if we need to refresh the availability data
|
||||
Instant nextRefresh = dnsLastSync.plus(Duration.ofHours(24));
|
||||
if (Instant.now().isBefore(nextRefresh)) {
|
||||
Duration remaining = Duration.between(Instant.now(), nextRefresh);
|
||||
TimeUnit.MINUTES.sleep(Math.max(1, remaining.toMinutes()));
|
||||
continue;
|
||||
}
|
||||
|
||||
dnsUpdateSchedule.replaceQueue(pingDao.getDnsUpdateSchedule(nid));
|
||||
dnsLastSync = Instant.now();
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
logger.error("DNS job fetch interrupted", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
@@ -0,0 +1,102 @@
|
||||
package nu.marginalia.ping;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Injector;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.coordination.DomainCoordinationModule;
|
||||
import nu.marginalia.geoip.GeoIpDictionary;
|
||||
import nu.marginalia.mq.MessageQueueFactory;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.mqapi.ping.PingRequest;
|
||||
import nu.marginalia.process.ProcessConfiguration;
|
||||
import nu.marginalia.process.ProcessConfigurationModule;
|
||||
import nu.marginalia.process.ProcessMainClass;
|
||||
import nu.marginalia.service.module.DatabaseModule;
|
||||
import nu.marginalia.service.module.ServiceDiscoveryModule;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.security.Security;
|
||||
|
||||
public class PingMain extends ProcessMainClass {
|
||||
private static final Logger log = LoggerFactory.getLogger(PingMain.class);
|
||||
|
||||
private final PingJobScheduler pingJobScheduler;
|
||||
private final int node;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(PingMain.class);
|
||||
|
||||
@Inject
|
||||
public PingMain(MessageQueueFactory messageQueueFactory,
|
||||
ProcessConfiguration config,
|
||||
Gson gson,
|
||||
PingJobScheduler pingJobScheduler,
|
||||
ProcessConfiguration processConfiguration
|
||||
) {
|
||||
super(messageQueueFactory, config, gson, ProcessInboxNames.PING_INBOX);
|
||||
|
||||
this.pingJobScheduler = pingJobScheduler;
|
||||
this.node = processConfiguration.node();
|
||||
}
|
||||
|
||||
public void runPrimary() {
|
||||
log.info("Starting PingMain...");
|
||||
|
||||
// Start the ping job scheduler
|
||||
pingJobScheduler.start();
|
||||
pingJobScheduler.enableForNode(node);
|
||||
|
||||
log.info("PingMain started successfully.");
|
||||
}
|
||||
|
||||
public static void main(String... args) throws Exception {
|
||||
// Prevent Java from caching DNS lookups forever (filling up the system RAM as a result)
|
||||
Security.setProperty("networkaddress.cache.ttl" , "3600");
|
||||
|
||||
// This must run *early*
|
||||
System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
|
||||
|
||||
// If these aren't set properly, the JVM will hang forever on some requests
|
||||
System.setProperty("sun.net.client.defaultConnectTimeout", "30000");
|
||||
System.setProperty("sun.net.client.defaultReadTimeout", "30000");
|
||||
|
||||
// Set the maximum number of connections to keep alive in the connection pool
|
||||
System.setProperty("jdk.httpclient.idleTimeout", "15"); // 15 seconds
|
||||
System.setProperty("jdk.httpclient.connectionPoolSize", "256");
|
||||
|
||||
// We don't want to use too much memory caching sessions for https
|
||||
System.setProperty("javax.net.ssl.sessionCacheSize", "2048");
|
||||
|
||||
|
||||
Injector injector = Guice.createInjector(
|
||||
new PingModule(),
|
||||
new ServiceDiscoveryModule(),
|
||||
new DomainCoordinationModule(),
|
||||
new ProcessConfigurationModule("ping"),
|
||||
new DatabaseModule(false)
|
||||
);
|
||||
|
||||
GeoIpDictionary geoIpDictionary = injector.getInstance(GeoIpDictionary.class);
|
||||
|
||||
geoIpDictionary.waitReady(); // Ensure the GeoIpDictionary is ready before proceeding
|
||||
|
||||
PingMain main = injector.getInstance(PingMain.class);
|
||||
|
||||
var instructions = main.fetchInstructions(PingRequest.class);
|
||||
|
||||
try {
|
||||
main.runPrimary();
|
||||
for(;;)
|
||||
synchronized (main) { // Wait on the object lock to avoid busy-looping
|
||||
main.wait();
|
||||
}
|
||||
}
|
||||
catch (Throwable ex) {
|
||||
logger.error("Error running ping process", ex);
|
||||
instructions.err();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,82 @@
|
||||
package nu.marginalia.ping;
|
||||
|
||||
import com.google.inject.AbstractModule;
import com.google.inject.Provides;
import com.google.inject.name.Named;

import nu.marginalia.ping.io.HttpClientProvider;
import nu.marginalia.ping.model.ErrorClassification;

import org.apache.hc.client5.http.classic.HttpClient;

import java.security.NoSuchAlgorithmException;
import java.time.Duration;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
|
||||
|
||||
public class PingModule extends AbstractModule {
|
||||
|
||||
public PingModule() throws NoSuchAlgorithmException {
|
||||
}
|
||||
|
||||
public static PingIntervalsConfiguration createPingIntervalsConfiguration() {
|
||||
Map<ErrorClassification, Duration> initialTimeouts = new HashMap<>();
|
||||
Map<ErrorClassification, Duration> maxTimeouts = new HashMap<>();
|
||||
|
||||
for (var classification : ErrorClassification.values()) {
|
||||
switch (classification) {
|
||||
case CONNECTION_ERROR -> {
|
||||
initialTimeouts.put(classification, Duration.ofMinutes(15));
|
||||
maxTimeouts.put(classification, Duration.ofDays(1));
|
||||
}
|
||||
case HTTP_CLIENT_ERROR -> {
|
||||
initialTimeouts.put(classification, Duration.ofMinutes(15));
|
||||
maxTimeouts.put(classification, Duration.ofDays(1));
|
||||
}
|
||||
case HTTP_SERVER_ERROR -> {
|
||||
initialTimeouts.put(classification, Duration.ofMinutes(8));
|
||||
maxTimeouts.put(classification, Duration.ofHours(6));
|
||||
}
|
||||
case SSL_ERROR -> {
|
||||
initialTimeouts.put(classification, Duration.ofMinutes(45));
|
||||
maxTimeouts.put(classification, Duration.ofDays(1));
|
||||
}
|
||||
case DNS_ERROR -> {
|
||||
initialTimeouts.put(classification, Duration.ofMinutes(60));
|
||||
maxTimeouts.put(classification, Duration.ofDays(7));
|
||||
}
|
||||
case TIMEOUT -> {
|
||||
initialTimeouts.put(classification, Duration.ofMinutes(5));
|
||||
maxTimeouts.put(classification, Duration.ofHours(6));
|
||||
}
|
||||
case UNKNOWN -> {
|
||||
initialTimeouts.put(classification, Duration.ofMinutes(30));
|
||||
maxTimeouts.put(classification, Duration.ofDays(1));
|
||||
}
|
||||
case NONE -> {
|
||||
initialTimeouts.put(classification, Duration.ofHours(6));
|
||||
maxTimeouts.put(classification, Duration.ofDays(6));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return new PingIntervalsConfiguration(
|
||||
Duration.ofHours(3),
|
||||
initialTimeouts,
|
||||
maxTimeouts
|
||||
);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void configure() {
|
||||
bind(HttpClient.class).toProvider(HttpClientProvider.class);
|
||||
|
||||
bind(PingIntervalsConfiguration.class).toInstance(createPingIntervalsConfiguration());
|
||||
}
|
||||
|
||||
@Provides
|
||||
@Named("ping.nameservers")
|
||||
public List<String> providePingNameservers() {
|
||||
// Google's public DNS servers currently have the best rate limiting
|
||||
return List.of("8.8.8.8", "8.8.4.4");
|
||||
}
|
||||
}
|
@@ -0,0 +1,109 @@
|
||||
package nu.marginalia.ping;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.*;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
/** In-memory schedule for updates, allowing jobs to be added and processed in order of their scheduled time.
 * This is not a particularly high-performance implementation, but exists to take contention off the database's
 * timestamp index.
 * */
public class UpdateSchedule<T, T2> {
    private final PriorityQueue<UpdateJob<T, T2>> queue;

    public record UpdateJob<T, T2>(T key, Instant updateTime) {}

    public UpdateSchedule(int initialCapacity) {
        queue = new PriorityQueue<>(initialCapacity, Comparator.comparing(UpdateJob::updateTime));
    }

    /** Schedules a key for processing at the given time and wakes any waiting consumers. */
    public synchronized void add(T key, Instant updateTime) {
        queue.add(new UpdateJob<>(key, updateTime));
        notifyAll();
    }

    /** Returns the next job in the queue that is due to be processed.
     * If no jobs are due, it will block until a job is added or a job becomes due.
     * */
    public synchronized T next() throws InterruptedException {
        for (;;) {
            UpdateJob<T, T2> head = queue.peek();

            if (head == null) {
                wait(); // queue empty: block until add()/replaceQueue() signals
                continue;
            }

            Instant now = Instant.now();
            if (!head.updateTime().isAfter(now)) {
                queue.poll(); // due: remove and hand out
                return head.key();
            }

            // Head is in the future: sleep until (roughly) its due time, then re-check
            long millisToWait = Duration.between(now, head.updateTime()).toMillis();
            wait(Math.max(1, millisToWait));
        }
    }

    /** Returns the first job in the queue matching the predicate that is not scheduled into the future,
     * blocking until a job is added or a job becomes due.
     */
    public synchronized T nextIf(Predicate<T> predicate) throws InterruptedException {
        // Jobs we popped but whose key failed the predicate; returned to the queue
        // before sleeping and on exit, so other consumers can pick them up.
        List<UpdateJob<T, T2>> deferred = new ArrayList<>();

        try {
            for (;;) {
                UpdateJob<T, T2> head = queue.peek();

                if (head == null) {
                    wait(); // queue empty: block until something is added
                    continue;
                }

                Instant now = Instant.now();
                if (head.updateTime().isAfter(now)) {
                    // Hand rejected jobs back before sleeping so other threads can take them
                    requeue(deferred);

                    long millisToWait = Duration.between(now, head.updateTime()).toMillis();
                    wait(Math.max(1, millisToWait));
                }
                else {
                    UpdateJob<T, T2> candidate = queue.poll();

                    assert candidate != null : "Update job should not be null at this point, since we just peeked it in a synchronized block";

                    if (predicate.test(candidate.key())) {
                        return candidate.key();
                    }
                    deferred.add(candidate);
                }
            }
        }
        finally {
            // Return any still-deferred jobs to the queue for other threads to process
            requeue(deferred);
        }
    }

    /** Puts deferred jobs back on the queue and signals waiters; no-op when empty.
     * Must be called while holding the monitor (all callers are synchronized). */
    private void requeue(List<UpdateJob<T, T2>> jobs) {
        if (jobs.isEmpty())
            return;

        queue.addAll(jobs);
        jobs.clear();
        notifyAll();
    }

    public synchronized void clear() {
        queue.clear();
        notifyAll();
    }

    public synchronized void replaceQueue(Collection<UpdateJob<T,T2>> newJobs) {
        queue.clear();
        queue.addAll(newJobs);
        notifyAll();
    }
}
|
@@ -0,0 +1,96 @@
|
||||
package nu.marginalia.ping.fetcher;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.ping.model.SingleDnsRecord;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.xbill.DNS.ExtendedResolver;
|
||||
import org.xbill.DNS.Lookup;
|
||||
import org.xbill.DNS.TextParseException;
|
||||
import org.xbill.DNS.Type;
|
||||
|
||||
import java.net.UnknownHostException;
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.*;
|
||||
|
||||
public class PingDnsFetcher {
|
||||
private final ThreadLocal<ExtendedResolver> resolver;
|
||||
private static final ExecutorService digExecutor = Executors.newFixedThreadPool(100);
|
||||
private static final Logger logger = LoggerFactory.getLogger(PingDnsFetcher.class);
|
||||
|
||||
private static final int[] RECORD_TYPES = {
|
||||
Type.A, Type.AAAA, Type.NS, Type.MX, Type.TXT,
|
||||
Type.SOA, Type.CNAME, Type.CAA, Type.SPF
|
||||
};
|
||||
|
||||
@Inject
|
||||
public PingDnsFetcher(@Named("ping.nameservers")
|
||||
List<String> nameservers) {
|
||||
resolver = ThreadLocal.withInitial(() -> createResolver(nameservers));
|
||||
}
|
||||
|
||||
private ExtendedResolver createResolver(List<String> nameservers) {
|
||||
try {
|
||||
ExtendedResolver r = new ExtendedResolver(
|
||||
nameservers.toArray(new String[0])
|
||||
);
|
||||
r.setLoadBalance(true);
|
||||
r.setTimeout(Duration.ofSeconds(5));
|
||||
return r;
|
||||
}
|
||||
catch (UnknownHostException e) {
|
||||
throw new RuntimeException("Failed to create DNS resolver", e);
|
||||
}
|
||||
}
|
||||
|
||||
private List<SingleDnsRecord> query(String domainName, int recordType) throws TextParseException {
|
||||
var resolver = this.resolver.get();
|
||||
var query = new Lookup(domainName, recordType);
|
||||
query.setResolver(resolver);
|
||||
|
||||
var result = query.run();
|
||||
|
||||
if (result == null || result.length == 0) {
|
||||
return List.of();
|
||||
}
|
||||
|
||||
List<SingleDnsRecord> records = new ArrayList<>(result.length);
|
||||
|
||||
for (var record : result) {
|
||||
if (record == null) continue;
|
||||
records.add(new SingleDnsRecord(
|
||||
Type.string(recordType),
|
||||
record.toString())
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
return records;
|
||||
}
|
||||
|
||||
public List<SingleDnsRecord> dig(String domainName) {
|
||||
List<Callable<List<SingleDnsRecord>>> tasks = new ArrayList<>(RECORD_TYPES.length);
|
||||
for (var recordType : RECORD_TYPES) {
|
||||
tasks.add(() -> query(domainName, recordType));
|
||||
}
|
||||
List<SingleDnsRecord> results = new ArrayList<>(RECORD_TYPES.length);
|
||||
try {
|
||||
List<Future<List<SingleDnsRecord>>> futures = digExecutor.invokeAll(tasks);
|
||||
for (Future<List<SingleDnsRecord>> future : futures) {
|
||||
try {
|
||||
results.addAll(future.get(1, TimeUnit.MINUTES));
|
||||
} catch (Exception e) {
|
||||
logger.error("Error fetching DNS records", e);
|
||||
}
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
logger.error("DNS query interrupted", e);
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,95 @@
|
||||
package nu.marginalia.ping.fetcher;
|
||||
|
||||
import com.google.inject.Inject;

import nu.marginalia.UserAgent;
import nu.marginalia.WmsaHome;
import nu.marginalia.ping.fetcher.response.*;

import org.apache.hc.client5.http.HttpHostConnectException;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.protocol.HttpClientContext;
import org.apache.hc.core5.http.Header;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;

import javax.net.ssl.SSLHandshakeException;
import java.io.IOException;
import java.net.SocketTimeoutException;
import java.net.UnknownHostException;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
|
||||
|
||||
public class PingHttpFetcher {
|
||||
private final UserAgent userAgent = WmsaHome.getUserAgent();
|
||||
private final HttpClient client;
|
||||
|
||||
@Inject
|
||||
public PingHttpFetcher(HttpClient client) {
|
||||
this.client = client;
|
||||
}
|
||||
|
||||
public PingRequestResponse fetchUrl(String url, Method method, String etag, String lastModified) {
|
||||
|
||||
var builder = ClassicRequestBuilder.create(method.name())
|
||||
.setUri(url)
|
||||
.addHeader("Accept", "text/*, */*;q=0.9")
|
||||
.addHeader("User-Agent", userAgent.uaString())
|
||||
.addHeader("Accept-Encoding", "gzip");
|
||||
if (etag != null) {
|
||||
builder.addHeader("If-None-Match", etag);
|
||||
}
|
||||
if (lastModified != null) {
|
||||
builder.addHeader("If-Modified-Since", lastModified);
|
||||
}
|
||||
|
||||
var req = builder.build();
|
||||
|
||||
HttpClientContext context = HttpClientContext.create();
|
||||
try {
|
||||
Instant start = Instant.now();
|
||||
return client.execute(req, context, (rsp) -> {
|
||||
|
||||
var entity = rsp.getEntity();
|
||||
|
||||
try {
|
||||
|
||||
Header[] rawHeaders = rsp.getHeaders();
|
||||
Map<String, List<String>> headers = new HashMap<>(rawHeaders.length);
|
||||
for (Header header : rawHeaders) {
|
||||
headers.computeIfAbsent(header.getName(), k -> new ArrayList<>())
|
||||
.add(header.getValue());
|
||||
}
|
||||
|
||||
if (method == Method.GET && entity == null) {
|
||||
return new ProtocolError("GET request returned no content");
|
||||
}
|
||||
|
||||
byte[] body = entity != null ? EntityUtils.toByteArray(entity) : null;
|
||||
|
||||
Duration responseTime = Duration.between(start, Instant.now());
|
||||
|
||||
return PingRequestResponse.of(
|
||||
rsp.getVersion(),
|
||||
rsp.getCode(),
|
||||
body,
|
||||
headers,
|
||||
responseTime,
|
||||
context.getSSLSession()
|
||||
);
|
||||
} finally {
|
||||
EntityUtils.consume(entity);
|
||||
}
|
||||
});
|
||||
} catch (SocketTimeoutException ex) {
|
||||
return new TimeoutResponse(ex.getMessage());
|
||||
} catch (HttpHostConnectException | SSLHandshakeException e) {
|
||||
return new ConnectionError(e.getClass().getSimpleName());
|
||||
} catch (IOException e) {
|
||||
return new ProtocolError(e.getClass().getSimpleName());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,4 @@
|
||||
package nu.marginalia.ping.fetcher.response;
|
||||
|
||||
/** Ping outcome: the TCP or TLS connection could not be established.
 * @param errorMessage short classifier of the failure (the exception's simple class name)
 */
public record ConnectionError(String errorMessage) implements PingRequestResponse {
}
|
@@ -0,0 +1,18 @@
|
||||
package nu.marginalia.ping.fetcher.response;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/** Lookup wrapper over a response's header multimap.
 * NOTE(review): lookups are exact-key (case-sensitive) even though HTTP header
 * names are case-insensitive on the wire — confirm callers normalize names. */
public record Headers(Map<String, List<String>> headers) {
    /** All values for the given header name; empty list when absent. */
    public List<String> get(String name) {
        return headers.getOrDefault(name, List.of());
    }

    /** The first value for the given header name, or null when absent/empty. */
    public String getFirst(String name) {
        List<String> values = headers.get(name);
        if (values == null || values.isEmpty())
            return null;
        return values.get(0);
    }

    /** Whether the header name is present at all. */
    public boolean contains(String name) {
        return headers.containsKey(name);
    }
}
|
@@ -0,0 +1,12 @@
|
||||
package nu.marginalia.ping.fetcher.response;
|
||||
|
||||
import java.time.Duration;
|
||||
|
||||
/** Ping outcome: a plain-HTTP (non-TLS) response was received.
 * @param version protocol version string (e.g. as reported by the client library)
 * @param httpStatus HTTP status code
 * @param body response body bytes; may be null for bodiless responses
 * @param headers response headers
 * @param httpResponseTime wall-clock duration of the request
 */
public record HttpResponse(
        String version,
        int httpStatus,
        byte[] body,
        Headers headers,
        Duration httpResponseTime
) implements PingRequestResponse {
}
|
@@ -0,0 +1,15 @@
|
||||
package nu.marginalia.ping.fetcher.response;
|
||||
|
||||
import java.security.cert.Certificate;
|
||||
import java.time.Duration;
|
||||
|
||||
/** Ping outcome: an HTTPS response was received, including the peer's certificate
 * chain and TLS session metadata for later validation/persistence.
 * @param version protocol version string
 * @param httpStatus HTTP status code
 * @param body response body bytes; may be null for bodiless responses
 * @param headers response headers
 * @param sslCertificates peer certificate chain from the TLS session
 * @param sslMetadata negotiated cipher suite and protocol
 * @param httpResponseTime wall-clock duration of the request
 */
public record HttpsResponse(
        String version,
        int httpStatus,
        byte[] body,
        Headers headers,
        Certificate[] sslCertificates,
        SslMetadata sslMetadata,
        Duration httpResponseTime
) implements PingRequestResponse {
}
|
@@ -0,0 +1,5 @@
|
||||
package nu.marginalia.ping.fetcher.response;
|
||||
|
||||
/** HTTP request methods supported by the ping fetcher. */
public enum Method {
    GET, HEAD
}
|
@@ -0,0 +1,22 @@
|
||||
package nu.marginalia.ping.fetcher.response;
|
||||
|
||||
import org.apache.hc.core5.http.ProtocolVersion;
|
||||
|
||||
import javax.net.ssl.SSLPeerUnverifiedException;
|
||||
import javax.net.ssl.SSLSession;
|
||||
import java.time.Duration;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/** Sealed union of every possible outcome of an HTTP(S) ping request:
 * a successful plain or TLS response, or one of the failure classifications. */
public sealed interface PingRequestResponse
        permits HttpResponse, HttpsResponse, TimeoutResponse, ConnectionError, ProtocolError, UnknownHostError {
    /** Factory choosing between HttpResponse and HttpsResponse based on whether
     * a TLS session was established.
     * @throws SSLPeerUnverifiedException if the peer's certificate chain cannot be read from the session
     */
    static PingRequestResponse of(ProtocolVersion version, int httpStatus, byte[] body, Map<String, List<String>> headers, Duration time, SSLSession sslSession) throws SSLPeerUnverifiedException {

        if (sslSession == null) {
            return new HttpResponse(version.toString(), httpStatus, body, new Headers(headers), time);
        } else {
            return new HttpsResponse(version.toString(), httpStatus, body, new Headers(headers), sslSession.getPeerCertificates(), new SslMetadata(sslSession), time);
        }
    }

}
|
@@ -0,0 +1,4 @@
|
||||
package nu.marginalia.ping.fetcher.response;
|
||||
|
||||
/** Ping outcome: the request failed at the protocol level (generic I/O error,
 * or a GET that returned no content).
 * @param errorMessage short classifier of the failure
 */
public record ProtocolError(String errorMessage) implements PingRequestResponse {
}
|
@@ -0,0 +1,14 @@
|
||||
package nu.marginalia.ping.fetcher.response;
|
||||
|
||||
import javax.net.ssl.SSLSession;
|
||||
|
||||
/** Negotiated TLS parameters captured from an established session.
 * @param cipherSuite the negotiated cipher suite name
 * @param protocol the negotiated protocol (e.g. a TLS version string)
 */
public record SslMetadata(
        String cipherSuite,
        String protocol) {
    /** Extracts the metadata from an active SSL session. */
    public SslMetadata(SSLSession session) {
        this(
                session.getCipherSuite(),
                session.getProtocol()
        );
    }
}
|
@@ -0,0 +1,4 @@
|
||||
package nu.marginalia.ping.fetcher.response;
|
||||
|
||||
/** Ping outcome: the request timed out (socket timeout).
 * @param errorMessage the timeout exception's message
 */
public record TimeoutResponse(String errorMessage) implements PingRequestResponse {
}
|
@@ -0,0 +1,4 @@
|
||||
package nu.marginalia.ping.fetcher.response;
|
||||
|
||||
/** Ping outcome: the hostname could not be resolved via DNS. */
public record UnknownHostError() implements PingRequestResponse {
}
|
@@ -0,0 +1,165 @@
|
||||
package nu.marginalia.ping.io;
|
||||
|
||||
import com.google.inject.Provider;
|
||||
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.config.ConnectionConfig;
|
||||
import org.apache.hc.client5.http.config.RequestConfig;
|
||||
import org.apache.hc.client5.http.cookie.StandardCookieSpec;
|
||||
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
|
||||
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager;
|
||||
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder;
|
||||
import org.apache.hc.client5.http.ssl.DefaultClientTlsStrategy;
|
||||
import org.apache.hc.client5.http.ssl.NoopHostnameVerifier;
|
||||
import org.apache.hc.core5.http.HeaderElement;
|
||||
import org.apache.hc.core5.http.HeaderElements;
|
||||
import org.apache.hc.core5.http.HttpResponse;
|
||||
import org.apache.hc.core5.http.io.SocketConfig;
|
||||
import org.apache.hc.core5.http.message.MessageSupport;
|
||||
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||
import org.apache.hc.core5.ssl.SSLContextBuilder;
|
||||
import org.apache.hc.core5.util.TimeValue;
|
||||
import org.apache.hc.core5.util.Timeout;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.net.ssl.SSLContext;
|
||||
import javax.net.ssl.TrustManager;
|
||||
import javax.net.ssl.X509TrustManager;
|
||||
import java.security.KeyManagementException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.security.cert.X509Certificate;
|
||||
import java.util.Iterator;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class HttpClientProvider implements Provider<HttpClient> {
|
||||
private static final HttpClient client;
|
||||
private static PoolingHttpClientConnectionManager connectionManager;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(HttpClientProvider.class);
|
||||
|
||||
static {
|
||||
try {
|
||||
client = createClient();
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private static CloseableHttpClient createClient() throws NoSuchAlgorithmException, KeyManagementException {
|
||||
final ConnectionConfig connectionConfig = ConnectionConfig.custom()
|
||||
.setSocketTimeout(15, TimeUnit.SECONDS)
|
||||
.setConnectTimeout(15, TimeUnit.SECONDS)
|
||||
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
|
||||
.build();
|
||||
|
||||
// No-op up front validation of server certificates.
|
||||
//
|
||||
// We will validate certificates later, after the connection is established
|
||||
// as we want to store the certificate chain and validation
|
||||
// outcome to the database.
|
||||
|
||||
var trustMeBro = new X509TrustManager() {
|
||||
private X509Certificate[] lastServerCertChain;
|
||||
|
||||
@Override
|
||||
public void checkClientTrusted(X509Certificate[] chain, String authType) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void checkServerTrusted(X509Certificate[] chain, String authType) {
|
||||
this.lastServerCertChain = chain.clone();
|
||||
}
|
||||
|
||||
@Override
|
||||
public X509Certificate[] getAcceptedIssuers() {
|
||||
return new X509Certificate[0];
|
||||
}
|
||||
|
||||
public X509Certificate[] getLastServerCertChain() {
|
||||
return lastServerCertChain != null ? lastServerCertChain.clone() : null;
|
||||
}
|
||||
};
|
||||
|
||||
SSLContext sslContext = SSLContextBuilder.create().build();
|
||||
sslContext.init(null, new TrustManager[]{trustMeBro}, null);
|
||||
|
||||
connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
|
||||
.setMaxConnPerRoute(2)
|
||||
.setMaxConnTotal(50)
|
||||
.setDefaultConnectionConfig(connectionConfig)
|
||||
.setTlsSocketStrategy(
|
||||
new DefaultClientTlsStrategy(sslContext, NoopHostnameVerifier.INSTANCE))
|
||||
.build();
|
||||
|
||||
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
|
||||
.setSoLinger(TimeValue.ofSeconds(-1))
|
||||
.setSoTimeout(Timeout.ofSeconds(10))
|
||||
.build()
|
||||
);
|
||||
|
||||
Thread.ofPlatform().daemon(true).start(() -> {
|
||||
try {
|
||||
for (;;) {
|
||||
TimeUnit.SECONDS.sleep(15);
|
||||
logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
|
||||
}
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
});
|
||||
|
||||
final RequestConfig defaultRequestConfig = RequestConfig.custom()
|
||||
.setCookieSpec(StandardCookieSpec.IGNORE)
|
||||
.setResponseTimeout(10, TimeUnit.SECONDS)
|
||||
.setConnectionRequestTimeout(5, TimeUnit.MINUTES)
|
||||
.build();
|
||||
|
||||
return HttpClients.custom()
|
||||
.setConnectionManager(connectionManager)
|
||||
.setRetryStrategy(new RetryStrategy())
|
||||
.setKeepAliveStrategy(new ConnectionKeepAliveStrategy() {
|
||||
// Default keep-alive duration is 3 minutes, but this is too long for us,
|
||||
// as we are either going to re-use it fairly quickly or close it for a long time.
|
||||
//
|
||||
// So we set it to 30 seconds or clamp the server-provided value to a minimum of 10 seconds.
|
||||
private static final TimeValue defaultValue = TimeValue.ofSeconds(30);
|
||||
|
||||
@Override
|
||||
public TimeValue getKeepAliveDuration(HttpResponse response, HttpContext context) {
|
||||
final Iterator<HeaderElement> it = MessageSupport.iterate(response, HeaderElements.KEEP_ALIVE);
|
||||
|
||||
while (it.hasNext()) {
|
||||
final HeaderElement he = it.next();
|
||||
final String param = he.getName();
|
||||
final String value = he.getValue();
|
||||
|
||||
if (value == null)
|
||||
continue;
|
||||
if (!"timeout".equalsIgnoreCase(param))
|
||||
continue;
|
||||
|
||||
try {
|
||||
long timeout = Long.parseLong(value);
|
||||
timeout = Math.clamp(timeout, 30, defaultValue.toSeconds());
|
||||
return TimeValue.ofSeconds(timeout);
|
||||
} catch (final NumberFormatException ignore) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return defaultValue;
|
||||
}
|
||||
})
|
||||
.disableRedirectHandling()
|
||||
.setDefaultRequestConfig(defaultRequestConfig)
|
||||
.build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HttpClient get() {
|
||||
return client;
|
||||
}
|
||||
}
|
||||
|
@@ -0,0 +1,79 @@
|
||||
package nu.marginalia.ping.io;
|
||||
|
||||
import org.apache.hc.client5.http.HttpHostConnectException;
|
||||
import org.apache.hc.client5.http.HttpRequestRetryStrategy;
|
||||
import org.apache.hc.core5.http.HttpRequest;
|
||||
import org.apache.hc.core5.http.HttpResponse;
|
||||
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||
import org.apache.hc.core5.util.TimeValue;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.net.ssl.SSLException;
|
||||
import java.io.IOException;
|
||||
import java.net.SocketException;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.net.UnknownHostException;
|
||||
|
||||
public class RetryStrategy implements HttpRequestRetryStrategy {
|
||||
private static final Logger logger = LoggerFactory.getLogger(RetryStrategy.class);
|
||||
|
||||
@Override
|
||||
public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
|
||||
return switch (exception) {
|
||||
case SocketTimeoutException ste -> false;
|
||||
case SSLException ssle -> false;
|
||||
case UnknownHostException uhe -> false;
|
||||
case HttpHostConnectException ex -> executionCount < 2;
|
||||
case SocketException ex -> executionCount < 2;
|
||||
default -> executionCount <= 3;
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean retryRequest(HttpResponse response, int executionCount, HttpContext context) {
|
||||
return switch (response.getCode()) {
|
||||
case 500, 503 -> executionCount <= 2;
|
||||
case 429 -> executionCount <= 3;
|
||||
default -> false;
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public TimeValue getRetryInterval(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
|
||||
return TimeValue.ofSeconds(1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TimeValue getRetryInterval(HttpResponse response, int executionCount, HttpContext context) {
|
||||
|
||||
int statusCode = response.getCode();
|
||||
|
||||
// Give 503 a bit more time
|
||||
if (statusCode == 503) return TimeValue.ofSeconds(5);
|
||||
|
||||
if (statusCode == 429) {
|
||||
// get the Retry-After header
|
||||
var retryAfterHeader = response.getFirstHeader("Retry-After");
|
||||
if (retryAfterHeader == null) {
|
||||
return TimeValue.ofSeconds(3);
|
||||
}
|
||||
|
||||
String retryAfter = retryAfterHeader.getValue();
|
||||
if (retryAfter == null) {
|
||||
return TimeValue.ofSeconds(2);
|
||||
}
|
||||
|
||||
try {
|
||||
int retryAfterTime = Integer.parseInt(retryAfter);
|
||||
retryAfterTime = Math.clamp(retryAfterTime, 1, 5);
|
||||
|
||||
return TimeValue.ofSeconds(retryAfterTime);
|
||||
} catch (NumberFormatException e) {
|
||||
logger.warn("Invalid Retry-After header: {}", retryAfter);
|
||||
}
|
||||
}
|
||||
|
||||
return TimeValue.ofSeconds(2);
|
||||
}
|
||||
}
|
@@ -0,0 +1,29 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
/** Broad categories of domain availability outages as persisted to the database;
 * a one-to-one mirror of {@link ErrorClassification}. */
public enum AvailabilityOutageType {
    NONE,
    TIMEOUT,
    SSL_ERROR,
    DNS_ERROR,
    CONNECTION_ERROR,
    HTTP_CLIENT_ERROR,
    HTTP_SERVER_ERROR,
    UNKNOWN;

    /** Maps an ErrorClassification to its outage type; a null input yields UNKNOWN.
     * The switch is exhaustive, so adding a classification forces an update here. */
    public static AvailabilityOutageType fromErrorClassification(ErrorClassification errorClassification) {
        if (null == errorClassification) {
            return UNKNOWN;
        }

        return switch (errorClassification) {
            case NONE -> NONE;
            case TIMEOUT -> TIMEOUT;
            case SSL_ERROR -> SSL_ERROR;
            case DNS_ERROR -> DNS_ERROR;
            case CONNECTION_ERROR -> CONNECTION_ERROR;
            case HTTP_CLIENT_ERROR -> HTTP_CLIENT_ERROR;
            case HTTP_SERVER_ERROR -> HTTP_SERVER_ERROR;
            case UNKNOWN -> UNKNOWN;
        };
    }
}
|
@@ -0,0 +1,49 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
import java.sql.Connection;
|
||||
import java.sql.SQLException;
|
||||
import java.time.Instant;
|
||||
|
||||
public record DomainAvailabilityEvent(
|
||||
int domainId,
|
||||
int nodeId,
|
||||
boolean available,
|
||||
AvailabilityOutageType outageType, // e.g., 'TIMEOUT', 'DNS_ERROR', etc.
|
||||
Integer httpStatusCode, // Nullable, as it may not always be applicable
|
||||
String errorMessage, // Specific error details
|
||||
Instant tsUpdate // Timestamp of the last update
|
||||
) implements WritableModel {
|
||||
|
||||
@Override
|
||||
public void write(Connection conn) throws SQLException {
|
||||
try (var ps = conn.prepareStatement("""
|
||||
INSERT INTO DOMAIN_AVAILABILITY_EVENTS (
|
||||
domain_id,
|
||||
node_id,
|
||||
available,
|
||||
outage_type,
|
||||
http_status_code,
|
||||
error_message,
|
||||
ts_change
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||
"""))
|
||||
{
|
||||
ps.setInt(1, domainId());
|
||||
ps.setInt(2, nodeId());
|
||||
ps.setBoolean(3, available());
|
||||
ps.setString(4, outageType().name());
|
||||
if (httpStatusCode() == null) {
|
||||
ps.setNull(5, java.sql.Types.INTEGER);
|
||||
} else {
|
||||
ps.setInt(5, httpStatusCode());
|
||||
}
|
||||
if (errorMessage() == null) {
|
||||
ps.setNull(6, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(6, errorMessage());
|
||||
}
|
||||
ps.setTimestamp(7, java.sql.Timestamp.from(tsUpdate()));
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,363 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.sql.Connection;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
|
||||
public record DomainAvailabilityRecord(
|
||||
int domainId,
|
||||
int nodeId,
|
||||
boolean serverAvailable,
|
||||
@Nullable byte[] serverIp,
|
||||
@Nullable Integer asn,
|
||||
@Nullable Long dataHash,
|
||||
@Nullable Long securityConfigHash,
|
||||
@Nullable HttpSchema httpSchema,
|
||||
@Nullable String httpEtag,
|
||||
@Nullable String httpLastModified,
|
||||
@Nullable Integer httpStatus,
|
||||
@Nullable String httpLocation,
|
||||
@Nullable Duration httpResponseTime,
|
||||
@Nullable ErrorClassification errorClassification,
|
||||
@Nullable String errorMessage,
|
||||
|
||||
@Nullable Instant tsLastPing,
|
||||
@Nullable Instant tsLastAvailable,
|
||||
@Nullable Instant tsLastError,
|
||||
|
||||
Instant nextScheduledUpdate,
|
||||
int backoffConsecutiveFailures,
|
||||
Duration backoffFetchInterval
|
||||
)
|
||||
implements WritableModel
|
||||
{
|
||||
public DomainAvailabilityRecord(ResultSet rs) throws SQLException {
|
||||
this(
|
||||
rs.getInt("DOMAIN_AVAILABILITY_INFORMATION.DOMAIN_ID"),
|
||||
rs.getInt("DOMAIN_AVAILABILITY_INFORMATION.NODE_ID"),
|
||||
rs.getBoolean("DOMAIN_AVAILABILITY_INFORMATION.SERVER_AVAILABLE"),
|
||||
rs.getBytes("DOMAIN_AVAILABILITY_INFORMATION.SERVER_IP"),
|
||||
rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.SERVER_IP_ASN", Integer.class),
|
||||
rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.DATA_HASH", Long.class),
|
||||
rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.SECURITY_CONFIG_HASH", Long.class),
|
||||
httpSchemaFromString(rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.HTTP_SCHEMA", String.class)),
|
||||
rs.getString("DOMAIN_AVAILABILITY_INFORMATION.HTTP_ETAG"),
|
||||
rs.getString("DOMAIN_AVAILABILITY_INFORMATION.HTTP_LAST_MODIFIED"),
|
||||
rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.HTTP_STATUS", Integer.class),
|
||||
rs.getString("DOMAIN_AVAILABILITY_INFORMATION.HTTP_LOCATION"),
|
||||
durationFromMillis(rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.HTTP_RESPONSE_TIME_MS", Integer.class)),
|
||||
errorClassificationFromString(rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.ERROR_CLASSIFICATION", String.class)),
|
||||
rs.getString("DOMAIN_AVAILABILITY_INFORMATION.ERROR_MESSAGE"),
|
||||
rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.TS_LAST_PING", Instant.class),
|
||||
rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.TS_LAST_AVAILABLE", Instant.class),
|
||||
rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.TS_LAST_ERROR", Instant.class),
|
||||
rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.NEXT_SCHEDULED_UPDATE", Instant.class),
|
||||
rs.getInt("DOMAIN_AVAILABILITY_INFORMATION.BACKOFF_CONSECUTIVE_FAILURES"),
|
||||
Duration.ofSeconds(rs.getInt("DOMAIN_AVAILABILITY_INFORMATION.BACKOFF_FETCH_INTERVAL"))
|
||||
);
|
||||
}
|
||||
|
||||
private static HttpSchema httpSchemaFromString(@Nullable String schema) {
|
||||
return schema == null ? null : HttpSchema.valueOf(schema);
|
||||
}
|
||||
private static ErrorClassification errorClassificationFromString(@Nullable String classification) {
|
||||
return classification == null ? null : ErrorClassification.valueOf(classification);
|
||||
}
|
||||
private static Duration durationFromMillis(@Nullable Integer millis) {
|
||||
return millis == null ? null : Duration.ofMillis(millis);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Instant nextUpdateTime() {
|
||||
return nextScheduledUpdate;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(Connection connection) throws SQLException {
|
||||
try (var ps = connection.prepareStatement(
|
||||
"""
|
||||
REPLACE INTO DOMAIN_AVAILABILITY_INFORMATION (
|
||||
domain_id,
|
||||
node_id,
|
||||
server_available,
|
||||
server_ip,
|
||||
data_hash,
|
||||
security_config_hash,
|
||||
http_schema,
|
||||
http_etag,
|
||||
http_last_modified,
|
||||
http_status,
|
||||
http_location,
|
||||
http_response_time_ms,
|
||||
error_classification,
|
||||
error_message,
|
||||
ts_last_ping,
|
||||
ts_last_available,
|
||||
ts_last_error,
|
||||
next_scheduled_update,
|
||||
backoff_consecutive_failures,
|
||||
backoff_fetch_interval,
|
||||
server_ip_asn)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,?)
|
||||
""")) {
|
||||
|
||||
ps.setInt(1, domainId());
|
||||
ps.setInt(2, nodeId());
|
||||
ps.setBoolean(3, serverAvailable());
|
||||
if (serverIp() == null) {
|
||||
ps.setNull(4, java.sql.Types.BINARY);
|
||||
} else {
|
||||
ps.setBytes(4, serverIp());
|
||||
}
|
||||
if (dataHash() == null) {
|
||||
ps.setNull(5, java.sql.Types.BIGINT);
|
||||
} else {
|
||||
ps.setLong(5, dataHash());
|
||||
}
|
||||
if (securityConfigHash() == null) {
|
||||
ps.setNull(6, java.sql.Types.BIGINT);
|
||||
} else {
|
||||
ps.setLong(6, securityConfigHash());
|
||||
}
|
||||
if (httpSchema() == null) {
|
||||
ps.setNull(7, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(7, httpSchema().name());
|
||||
}
|
||||
if (httpEtag() == null) {
|
||||
ps.setNull(8, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(8, httpEtag());
|
||||
}
|
||||
if (httpLastModified() == null) {
|
||||
ps.setNull(9, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(9, httpLastModified());
|
||||
}
|
||||
if (httpStatus() == null) {
|
||||
ps.setNull(10, java.sql.Types.INTEGER);
|
||||
}
|
||||
else {
|
||||
ps.setInt(10, httpStatus());
|
||||
}
|
||||
if (httpLocation() == null) {
|
||||
ps.setNull(11, java.sql.Types.VARCHAR);
|
||||
}
|
||||
else {
|
||||
ps.setString(11, httpLocation());
|
||||
}
|
||||
|
||||
if (httpResponseTime() == null) {
|
||||
ps.setNull(12, java.sql.Types.SMALLINT);
|
||||
}
|
||||
else {
|
||||
ps.setInt(12, Math.clamp(httpResponseTime().toMillis(), 0, 0xFFFF)); // "unsigned short" in SQL
|
||||
}
|
||||
|
||||
if (errorClassification() == null) {
|
||||
ps.setNull(13, java.sql.Types.VARCHAR);
|
||||
}
|
||||
else {
|
||||
ps.setString(13, errorClassification().name());
|
||||
}
|
||||
|
||||
if (errorMessage() == null) {
|
||||
ps.setNull(14, java.sql.Types.VARCHAR);
|
||||
}
|
||||
else {
|
||||
ps.setString(14, errorMessage());
|
||||
}
|
||||
|
||||
ps.setTimestamp(15, java.sql.Timestamp.from(tsLastPing()));
|
||||
|
||||
if (tsLastAvailable() == null) {
|
||||
ps.setNull(16, java.sql.Types.TIMESTAMP);
|
||||
}
|
||||
else {
|
||||
ps.setTimestamp(16, java.sql.Timestamp.from(tsLastAvailable()));
|
||||
}
|
||||
if (tsLastError() == null) {
|
||||
ps.setNull(17, java.sql.Types.TIMESTAMP);
|
||||
}
|
||||
else {
|
||||
ps.setTimestamp(17, java.sql.Timestamp.from(tsLastError()));
|
||||
}
|
||||
|
||||
ps.setTimestamp(18, java.sql.Timestamp.from(nextScheduledUpdate()));
|
||||
ps.setInt(19, backoffConsecutiveFailures());
|
||||
ps.setInt(20, (int) backoffFetchInterval().getSeconds());
|
||||
|
||||
if (asn() == null) {
|
||||
ps.setNull(21, java.sql.Types.INTEGER);
|
||||
} else {
|
||||
ps.setInt(21, asn());
|
||||
}
|
||||
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
public static class Builder {
|
||||
private int domainId;
|
||||
private int nodeId;
|
||||
private boolean serverAvailable;
|
||||
private byte[] serverIp;
|
||||
private Integer serverIpAsn;
|
||||
private Long dataHash;
|
||||
private Long securityConfigHash;
|
||||
private HttpSchema httpSchema;
|
||||
private String httpEtag;
|
||||
private String httpLastModified;
|
||||
private Integer httpStatus;
|
||||
private String httpLocation;
|
||||
private Duration httpResponseTime;
|
||||
private ErrorClassification errorClassification;
|
||||
private String errorMessage;
|
||||
private Instant tsLastPing;
|
||||
private Instant tsLastAvailable;
|
||||
private Instant tsLastError;
|
||||
private Instant nextScheduledUpdate;
|
||||
private int backoffConsecutiveFailures;
|
||||
private Duration backoffFetchInterval;
|
||||
|
||||
public Builder domainId(int domainId) {
|
||||
this.domainId = domainId;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder nodeId(int nodeId) {
|
||||
this.nodeId = nodeId;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder serverAvailable(boolean serverAvailable) {
|
||||
this.serverAvailable = serverAvailable;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder serverIp(byte[] serverIp) {
|
||||
this.serverIp = serverIp;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder serverIpAsn(Integer asn) {
|
||||
this.serverIpAsn = asn;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder dataHash(Long dataHash) {
|
||||
this.dataHash = dataHash;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder securityConfigHash(Long securityConfigHash) {
|
||||
this.securityConfigHash = securityConfigHash;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder httpSchema(HttpSchema httpSchema) {
|
||||
this.httpSchema = httpSchema;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder httpEtag(String httpEtag) {
|
||||
this.httpEtag = httpEtag;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder httpLastModified(String httpLastModified) {
|
||||
this.httpLastModified = httpLastModified;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder httpStatus(Integer httpStatus) {
|
||||
this.httpStatus = httpStatus;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder httpLocation(String httpLocation) {
|
||||
this.httpLocation = StringUtils.abbreviate(httpLocation, "...",255);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder httpResponseTime(Duration httpResponseTime) {
|
||||
this.httpResponseTime = httpResponseTime;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder errorClassification(ErrorClassification errorClassification) {
|
||||
this.errorClassification = errorClassification;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder errorMessage(String errorMessage) {
|
||||
this.errorMessage = errorMessage;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder tsLastPing(Instant tsLastPing) {
|
||||
this.tsLastPing = tsLastPing;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder tsLastAvailable(Instant tsLastAvailable) {
|
||||
this.tsLastAvailable = tsLastAvailable;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder tsLastError(Instant tsLastError) {
|
||||
this.tsLastError = tsLastError;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder nextScheduledUpdate(Instant nextScheduledUpdate) {
|
||||
this.nextScheduledUpdate = nextScheduledUpdate;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder backoffConsecutiveFailures(int backoffConsecutiveFailures) {
|
||||
this.backoffConsecutiveFailures = backoffConsecutiveFailures;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder backoffFetchInterval(Duration backoffFetchInterval) {
|
||||
this.backoffFetchInterval = backoffFetchInterval;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainAvailabilityRecord build() {
|
||||
return new DomainAvailabilityRecord(
|
||||
domainId,
|
||||
nodeId,
|
||||
serverAvailable,
|
||||
serverIp,
|
||||
serverIpAsn,
|
||||
dataHash,
|
||||
securityConfigHash,
|
||||
httpSchema,
|
||||
httpEtag,
|
||||
httpLastModified,
|
||||
httpStatus,
|
||||
httpLocation,
|
||||
httpResponseTime,
|
||||
errorClassification,
|
||||
errorMessage,
|
||||
tsLastPing,
|
||||
tsLastAvailable,
|
||||
tsLastError,
|
||||
nextScheduledUpdate,
|
||||
backoffConsecutiveFailures,
|
||||
backoffFetchInterval
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
public static Builder builder() {
|
||||
return new Builder();
|
||||
}
|
||||
}
|
@@ -0,0 +1,369 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import nu.marginalia.model.gson.GsonFactory;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.sql.Connection;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.time.Instant;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public record DomainDnsRecord(
|
||||
Integer dnsRootDomainId,
|
||||
String rootDomainName,
|
||||
int nodeAffinity,
|
||||
@Nullable List<String> aRecords,
|
||||
@Nullable List<String> aaaaRecords,
|
||||
@Nullable String cnameRecord,
|
||||
@Nullable List<String> mxRecords,
|
||||
@Nullable List<String> caaRecords,
|
||||
@Nullable List<String> txtRecords,
|
||||
@Nullable List<String> nsRecords,
|
||||
@Nullable String soaRecord,
|
||||
Instant tsLastUpdate,
|
||||
Instant tsNextScheduledUpdate,
|
||||
int dnsCheckPriority)
|
||||
implements WritableModel
|
||||
{
|
||||
private static Gson gson = GsonFactory.get();
|
||||
|
||||
public DomainDnsRecord(ResultSet rs) throws SQLException {
|
||||
this(
|
||||
rs.getObject("DNS_ROOT_DOMAIN_ID", Integer.class),
|
||||
rs.getString("ROOT_DOMAIN_NAME"),
|
||||
rs.getInt("NODE_AFFINITY"),
|
||||
deserializeJsonArray(rs.getString("DNS_A_RECORDS")),
|
||||
deserializeJsonArray(rs.getString("DNS_AAAA_RECORDS")),
|
||||
rs.getString("DNS_CNAME_RECORD"),
|
||||
deserializeJsonArray(rs.getString("DNS_MX_RECORDS")),
|
||||
deserializeJsonArray(rs.getString("DNS_CAA_RECORDS")),
|
||||
deserializeJsonArray(rs.getString("DNS_TXT_RECORDS")),
|
||||
deserializeJsonArray(rs.getString("DNS_NS_RECORDS")),
|
||||
rs.getString("DNS_SOA_RECORD"),
|
||||
rs.getObject("TS_LAST_DNS_CHECK", Instant.class),
|
||||
rs.getObject("TS_NEXT_DNS_CHECK", Instant.class),
|
||||
rs.getInt("DNS_CHECK_PRIORITY")
|
||||
);
|
||||
}
|
||||
|
||||
static List<String> deserializeJsonArray(@Nullable String json) {
|
||||
if (json == null || json.isEmpty()) {
|
||||
return List.of();
|
||||
}
|
||||
return gson.fromJson(json, List.class);
|
||||
}
|
||||
|
||||
public static Builder builder() {
|
||||
return new Builder();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Instant nextUpdateTime() {
|
||||
return tsNextScheduledUpdate;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(Connection connection) throws SQLException {
|
||||
|
||||
if (dnsRootDomainId() != null) {
|
||||
update(connection);
|
||||
return;
|
||||
}
|
||||
|
||||
try (var ps = connection.prepareStatement("""
|
||||
REPLACE INTO DOMAIN_DNS_INFORMATION (
|
||||
ROOT_DOMAIN_NAME,
|
||||
NODE_AFFINITY,
|
||||
DNS_A_RECORDS,
|
||||
DNS_AAAA_RECORDS,
|
||||
DNS_CNAME_RECORD,
|
||||
DNS_MX_RECORDS,
|
||||
DNS_CAA_RECORDS,
|
||||
DNS_TXT_RECORDS,
|
||||
DNS_NS_RECORDS,
|
||||
DNS_SOA_RECORD,
|
||||
TS_LAST_DNS_CHECK,
|
||||
TS_NEXT_DNS_CHECK,
|
||||
DNS_CHECK_PRIORITY
|
||||
) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)
|
||||
""")) {
|
||||
|
||||
ps.setString(1, rootDomainName());
|
||||
ps.setInt(2, nodeAffinity());
|
||||
|
||||
if (aRecords() == null) {
|
||||
ps.setNull(3, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(3, gson.toJson(aRecords()));
|
||||
}
|
||||
if (aaaaRecords() == null) {
|
||||
ps.setNull(4, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(4, gson.toJson(aaaaRecords()));
|
||||
}
|
||||
if (cnameRecord() == null) {
|
||||
ps.setNull(5, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(5, cnameRecord());
|
||||
}
|
||||
if (mxRecords() == null) {
|
||||
ps.setNull(6, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(6, gson.toJson(mxRecords()));
|
||||
}
|
||||
if (caaRecords() == null) {
|
||||
ps.setNull(7, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(7, gson.toJson(caaRecords()));
|
||||
}
|
||||
if (txtRecords() == null) {
|
||||
ps.setNull(8, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(8, gson.toJson(txtRecords()));
|
||||
}
|
||||
if (nsRecords() == null) {
|
||||
ps.setNull(9, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(9, gson.toJson(nsRecords()));
|
||||
}
|
||||
if (soaRecord() == null) {
|
||||
ps.setNull(10, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(10, soaRecord());
|
||||
}
|
||||
ps.setString(10, soaRecord());
|
||||
ps.setTimestamp(11, java.sql.Timestamp.from(tsLastUpdate()));
|
||||
ps.setTimestamp(12, java.sql.Timestamp.from(tsNextScheduledUpdate()));
|
||||
ps.setInt(13, dnsCheckPriority());
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
public void update(Connection connection) throws SQLException {
|
||||
|
||||
try (var ps = connection.prepareStatement("""
|
||||
REPLACE INTO DOMAIN_DNS_INFORMATION (
|
||||
DNS_ROOT_DOMAIN_ID,
|
||||
ROOT_DOMAIN_NAME,
|
||||
NODE_AFFINITY,
|
||||
DNS_A_RECORDS,
|
||||
DNS_AAAA_RECORDS,
|
||||
DNS_CNAME_RECORD,
|
||||
DNS_MX_RECORDS,
|
||||
DNS_CAA_RECORDS,
|
||||
DNS_TXT_RECORDS,
|
||||
DNS_NS_RECORDS,
|
||||
DNS_SOA_RECORD,
|
||||
TS_LAST_DNS_CHECK,
|
||||
TS_NEXT_DNS_CHECK,
|
||||
DNS_CHECK_PRIORITY
|
||||
) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)
|
||||
""")) {
|
||||
|
||||
ps.setObject(1, dnsRootDomainId(), java.sql.Types.INTEGER);
|
||||
ps.setString(2, rootDomainName());
|
||||
ps.setInt(3, nodeAffinity());
|
||||
|
||||
if (aRecords() == null) {
|
||||
ps.setNull(4, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(4, gson.toJson(aRecords()));
|
||||
}
|
||||
if (aaaaRecords() == null) {
|
||||
ps.setNull(5, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(5, gson.toJson(aaaaRecords()));
|
||||
}
|
||||
if (cnameRecord() == null) {
|
||||
ps.setNull(6, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(6, cnameRecord());
|
||||
}
|
||||
if (mxRecords() == null) {
|
||||
ps.setNull(7, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(7, gson.toJson(mxRecords()));
|
||||
}
|
||||
if (caaRecords() == null) {
|
||||
ps.setNull(8, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(8, gson.toJson(caaRecords()));
|
||||
}
|
||||
if (txtRecords() == null) {
|
||||
ps.setNull(9, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(9, gson.toJson(txtRecords()));
|
||||
}
|
||||
if (nsRecords() == null) {
|
||||
ps.setNull(10, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(10, gson.toJson(nsRecords()));
|
||||
}
|
||||
if (soaRecord() == null) {
|
||||
ps.setNull(11, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(11, soaRecord());
|
||||
}
|
||||
ps.setTimestamp(12, java.sql.Timestamp.from(tsLastUpdate()));
|
||||
ps.setTimestamp(13, java.sql.Timestamp.from(tsNextScheduledUpdate()));
|
||||
ps.setInt(14, dnsCheckPriority());
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
public static class Builder {
|
||||
private Integer dnsRootDomainId;
|
||||
private String rootDomainName;
|
||||
private int nodeAffinity;
|
||||
private List<String> aRecords;
|
||||
private List<String> aaaaRecords;
|
||||
private String cnameRecord;
|
||||
private List<String> mxRecords;
|
||||
private List<String> caaRecords;
|
||||
private List<String> txtRecords;
|
||||
private List<String> nsRecords;
|
||||
private String soaRecord;
|
||||
private Instant tsLastUpdate;
|
||||
private Instant tsNextScheduledUpdate;
|
||||
private int dnsCheckPriority;
|
||||
|
||||
public Builder dnsRootDomainId(Integer dnsRootDomainId) {
|
||||
this.dnsRootDomainId = dnsRootDomainId;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder rootDomainName(String rootDomainName) {
|
||||
this.rootDomainName = rootDomainName;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder nodeAffinity(int nodeAffinity) {
|
||||
this.nodeAffinity = nodeAffinity;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder addARecord(String aRecord) {
|
||||
if (this.aRecords == null) {
|
||||
this.aRecords = new ArrayList<>();
|
||||
}
|
||||
this.aRecords.add(aRecord);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder aRecords(List<String> aRecords) {
|
||||
this.aRecords = aRecords;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder addAaaaRecord(String aaaaRecord) {
|
||||
if (this.aaaaRecords == null) {
|
||||
this.aaaaRecords = new ArrayList<>();
|
||||
}
|
||||
this.aaaaRecords.add(aaaaRecord);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder aaaaRecords(List<String> aaaaRecords) {
|
||||
this.aaaaRecords = aaaaRecords;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder cnameRecord(String cnameRecord) {
|
||||
this.cnameRecord = cnameRecord;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder addMxRecord(String mxRecord) {
|
||||
if (this.mxRecords == null) {
|
||||
this.mxRecords = new ArrayList<>();
|
||||
}
|
||||
this.mxRecords.add(mxRecord);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder mxRecords(List<String> mxRecords) {
|
||||
this.mxRecords = mxRecords;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder addCaaRecord(String caaRecord) {
|
||||
if (this.caaRecords == null) {
|
||||
this.caaRecords = new ArrayList<>();
|
||||
}
|
||||
this.caaRecords.add(caaRecord);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder caaRecords(List<String> caaRecords) {
|
||||
this.caaRecords = caaRecords;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder addTxtRecord(String txtRecord) {
|
||||
if (this.txtRecords == null) {
|
||||
this.txtRecords = new ArrayList<>();
|
||||
}
|
||||
this.txtRecords.add(txtRecord);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder txtRecords(List<String> txtRecords) {
|
||||
this.txtRecords = txtRecords;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder addNsRecord(String nsRecord) {
|
||||
if (this.nsRecords == null) {
|
||||
this.nsRecords = new ArrayList<>();
|
||||
}
|
||||
this.nsRecords.add(nsRecord);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder nsRecords(List<String> nsRecords) {
|
||||
this.nsRecords = nsRecords;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder soaRecord(String soaRecord) {
|
||||
this.soaRecord = soaRecord;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder tsLastUpdate(Instant tsLastUpdate) {
|
||||
this.tsLastUpdate = tsLastUpdate;
|
||||
return this;
|
||||
}
|
||||
public Builder tsNextScheduledUpdate(Instant nextScheduledUpdate) {
|
||||
this.tsNextScheduledUpdate = nextScheduledUpdate;
|
||||
return this;
|
||||
}
|
||||
public Builder dnsCheckPriority(int dnsCheckPriority) {
|
||||
this.dnsCheckPriority = dnsCheckPriority;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainDnsRecord build() {
|
||||
return new DomainDnsRecord(
|
||||
dnsRootDomainId,
|
||||
rootDomainName,
|
||||
nodeAffinity,
|
||||
aRecords,
|
||||
aaaaRecords,
|
||||
cnameRecord,
|
||||
mxRecords,
|
||||
caaRecords,
|
||||
txtRecords,
|
||||
nsRecords,
|
||||
soaRecord,
|
||||
tsLastUpdate,
|
||||
tsNextScheduledUpdate,
|
||||
dnsCheckPriority
|
||||
);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
@@ -0,0 +1,10 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
|
||||
public record DomainReference(int domainId, int nodeId, String domainName) {
|
||||
public EdgeDomain asEdgeDomain() {
|
||||
return new EdgeDomain(domainName);
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,91 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
import nu.marginalia.ping.util.JsonObject;
|
||||
|
||||
import java.sql.Connection;
|
||||
import java.sql.SQLException;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
|
||||
public record DomainSecurityEvent(
|
||||
int domainId,
|
||||
int nodeId,
|
||||
Instant tsChange,
|
||||
boolean asnChanged,
|
||||
boolean certificateFingerprintChanged,
|
||||
boolean certificateProfileChanged,
|
||||
boolean certificateSanChanged,
|
||||
boolean certificatePublicKeyChanged,
|
||||
boolean certificateSerialNumberChanged,
|
||||
boolean certificateIssuerChanged,
|
||||
SchemaChange schemaChange,
|
||||
Duration oldCertificateTimeToExpiry,
|
||||
boolean securityHeadersChanged,
|
||||
boolean ipChanged,
|
||||
boolean softwareChanged,
|
||||
JsonObject<DomainSecurityRecord> securitySignatureBefore,
|
||||
JsonObject<DomainSecurityRecord> securitySignatureAfter
|
||||
) implements WritableModel {
|
||||
|
||||
@Override
|
||||
public void write(Connection connection) throws SQLException {
|
||||
try (var ps = connection.prepareStatement("""
|
||||
INSERT INTO DOMAIN_SECURITY_EVENTS (
|
||||
domain_id,
|
||||
node_id,
|
||||
ts_change,
|
||||
change_asn,
|
||||
change_certificate_fingerprint,
|
||||
change_certificate_profile,
|
||||
change_certificate_san,
|
||||
change_certificate_public_key,
|
||||
change_security_headers,
|
||||
change_ip_address,
|
||||
change_software,
|
||||
old_cert_time_to_expiry,
|
||||
security_signature_before,
|
||||
security_signature_after,
|
||||
change_certificate_serial_number,
|
||||
change_certificate_issuer,
|
||||
change_schema
|
||||
) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
|
||||
"""))
|
||||
{
|
||||
|
||||
ps.setInt(1, domainId());
|
||||
ps.setInt(2, nodeId());
|
||||
ps.setTimestamp(3, java.sql.Timestamp.from(tsChange()));
|
||||
ps.setBoolean(4, asnChanged());
|
||||
ps.setBoolean(5, certificateFingerprintChanged());
|
||||
ps.setBoolean(6, certificateProfileChanged());
|
||||
ps.setBoolean(7, certificateSanChanged());
|
||||
ps.setBoolean(8, certificatePublicKeyChanged());
|
||||
ps.setBoolean(9, securityHeadersChanged());
|
||||
ps.setBoolean(10, ipChanged());
|
||||
ps.setBoolean(11, softwareChanged());
|
||||
|
||||
if (oldCertificateTimeToExpiry() == null) {
|
||||
ps.setNull(12, java.sql.Types.BIGINT);
|
||||
} else {
|
||||
ps.setLong(12, oldCertificateTimeToExpiry().toHours());
|
||||
}
|
||||
|
||||
if (securitySignatureBefore() == null) {
|
||||
ps.setNull(13, java.sql.Types.BLOB);
|
||||
} else {
|
||||
ps.setBytes(13, securitySignatureBefore().compressed());
|
||||
}
|
||||
if (securitySignatureAfter() == null) {
|
||||
ps.setNull(14, java.sql.Types.BLOB);
|
||||
} else {
|
||||
ps.setBytes(14, securitySignatureAfter().compressed());
|
||||
}
|
||||
|
||||
ps.setBoolean(15, certificateSerialNumberChanged());
|
||||
ps.setBoolean(16, certificateIssuerChanged());
|
||||
ps.setString(17, schemaChange.name());
|
||||
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,604 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.sql.Connection;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Types;
|
||||
import java.time.Instant;
|
||||
import java.util.Objects;
|
||||
|
||||
public record DomainSecurityRecord(
|
||||
int domainId,
|
||||
int nodeId,
|
||||
@Nullable Integer asn,
|
||||
@Nullable HttpSchema httpSchema,
|
||||
@Nullable String httpVersion,
|
||||
@Nullable String httpCompression,
|
||||
@Nullable String httpCacheControl,
|
||||
@Nullable Instant sslCertNotBefore,
|
||||
@Nullable Instant sslCertNotAfter,
|
||||
@Nullable String sslCertIssuer,
|
||||
@Nullable String sslCertSubject,
|
||||
@Nullable byte[] sslCertPublicKeyHash,
|
||||
@Nullable String sslCertSerialNumber,
|
||||
@Nullable byte[] sslCertFingerprintSha256,
|
||||
@Nullable String sslCertSan,
|
||||
boolean sslCertWildcard,
|
||||
@Nullable String sslProtocol,
|
||||
@Nullable String sslCipherSuite,
|
||||
@Nullable String sslKeyExchange,
|
||||
@Nullable Integer sslCertificateChainLength,
|
||||
boolean sslCertificateValid,
|
||||
@Nullable String headerCorsAllowOrigin,
|
||||
boolean headerCorsAllowCredentials,
|
||||
@Nullable Integer headerContentSecurityPolicyHash,
|
||||
@Nullable String headerStrictTransportSecurity,
|
||||
@Nullable String headerReferrerPolicy,
|
||||
@Nullable String headerXFrameOptions,
|
||||
@Nullable String headerXContentTypeOptions,
|
||||
@Nullable String headerXXssProtection,
|
||||
@Nullable String headerServer,
|
||||
@Nullable String headerXPoweredBy,
|
||||
@Nullable Instant tsLastUpdate,
|
||||
@Nullable Boolean sslChainValid,
|
||||
@Nullable Boolean sslHostValid,
|
||||
@Nullable Boolean sslDateValid
|
||||
)
|
||||
implements WritableModel
|
||||
{
|
||||
|
||||
public int certificateProfileHash() {
|
||||
return Objects.hash(
|
||||
sslCertIssuer,
|
||||
sslCertSubject,
|
||||
sslCipherSuite,
|
||||
sslKeyExchange
|
||||
);
|
||||
}
|
||||
|
||||
public int securityHeadersHash() {
|
||||
return Objects.hash(
|
||||
headerCorsAllowOrigin,
|
||||
headerCorsAllowCredentials,
|
||||
headerContentSecurityPolicyHash,
|
||||
headerStrictTransportSecurity,
|
||||
headerReferrerPolicy,
|
||||
headerXFrameOptions,
|
||||
headerXContentTypeOptions,
|
||||
headerXXssProtection
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
public DomainSecurityRecord(ResultSet rs) throws SQLException {
|
||||
this(rs.getInt("DOMAIN_SECURITY_INFORMATION.DOMAIN_ID"),
|
||||
rs.getInt("DOMAIN_SECURITY_INFORMATION.NODE_ID"),
|
||||
rs.getObject("DOMAIN_SECURITY_INFORMATION.ASN", Integer.class),
|
||||
httpSchemaFromString(rs.getString("DOMAIN_SECURITY_INFORMATION.HTTP_SCHEMA")),
|
||||
rs.getString("DOMAIN_SECURITY_INFORMATION.HTTP_VERSION"),
|
||||
rs.getString("DOMAIN_SECURITY_INFORMATION.HTTP_COMPRESSION"),
|
||||
rs.getString("DOMAIN_SECURITY_INFORMATION.HTTP_CACHE_CONTROL"),
|
||||
rs.getObject("DOMAIN_SECURITY_INFORMATION.SSL_CERT_NOT_BEFORE", Instant.class),
|
||||
rs.getObject("DOMAIN_SECURITY_INFORMATION.SSL_CERT_NOT_AFTER", Instant.class),
|
||||
rs.getString("DOMAIN_SECURITY_INFORMATION.SSL_CERT_ISSUER"),
|
||||
rs.getString("DOMAIN_SECURITY_INFORMATION.SSL_CERT_SUBJECT"),
|
||||
rs.getBytes("DOMAIN_SECURITY_INFORMATION.SSL_CERT_PUBLIC_KEY_HASH"),
|
||||
rs.getString("DOMAIN_SECURITY_INFORMATION.SSL_CERT_SERIAL_NUMBER"),
|
||||
rs.getBytes("DOMAIN_SECURITY_INFORMATION.SSL_CERT_FINGERPRINT_SHA256"),
|
||||
rs.getString("DOMAIN_SECURITY_INFORMATION.SSL_CERT_SAN"),
|
||||
rs.getBoolean("DOMAIN_SECURITY_INFORMATION.SSL_CERT_WILDCARD"),
|
||||
rs.getString("DOMAIN_SECURITY_INFORMATION.SSL_PROTOCOL"),
|
||||
rs.getString("DOMAIN_SECURITY_INFORMATION.SSL_CIPHER_SUITE"),
|
||||
rs.getString("DOMAIN_SECURITY_INFORMATION.SSL_KEY_EXCHANGE"),
|
||||
rs.getObject("DOMAIN_SECURITY_INFORMATION.SSL_CERTIFICATE_CHAIN_LENGTH", Integer.class),
|
||||
rs.getBoolean("DOMAIN_SECURITY_INFORMATION.SSL_CERTIFICATE_VALID"),
|
||||
rs.getString("DOMAIN_SECURITY_INFORMATION.HEADER_CORS_ALLOW_ORIGIN"),
|
||||
rs.getBoolean("DOMAIN_SECURITY_INFORMATION.HEADER_CORS_ALLOW_CREDENTIALS"),
|
||||
rs.getInt("DOMAIN_SECURITY_INFORMATION.HEADER_CONTENT_SECURITY_POLICY_HASH"),
|
||||
rs.getString("DOMAIN_SECURITY_INFORMATION.HEADER_STRICT_TRANSPORT_SECURITY"),
|
||||
rs.getString("DOMAIN_SECURITY_INFORMATION.HEADER_REFERRER_POLICY"),
|
||||
rs.getString("DOMAIN_SECURITY_INFORMATION.HEADER_X_FRAME_OPTIONS"),
|
||||
rs.getString("DOMAIN_SECURITY_INFORMATION.HEADER_X_CONTENT_TYPE_OPTIONS"),
|
||||
rs.getString("DOMAIN_SECURITY_INFORMATION.HEADER_X_XSS_PROTECTION"),
|
||||
rs.getString("DOMAIN_SECURITY_INFORMATION.HEADER_SERVER"),
|
||||
rs.getString("DOMAIN_SECURITY_INFORMATION.HEADER_X_POWERED_BY"),
|
||||
rs.getObject("DOMAIN_SECURITY_INFORMATION.TS_LAST_UPDATE", Instant.class),
|
||||
rs.getObject("SSL_CHAIN_VALID", Boolean.class),
|
||||
rs.getObject("SSL_HOST_VALID", Boolean.class),
|
||||
rs.getObject("SSL_DATE_VALID", Boolean.class)
|
||||
);
|
||||
}
|
||||
|
||||
private static HttpSchema httpSchemaFromString(@Nullable String schema) {
|
||||
return schema == null ? null : HttpSchema.valueOf(schema);
|
||||
}
|
||||
|
||||
private static SslCertRevocationStatus sslCertRevocationStatusFromString(@Nullable String status) {
|
||||
return status == null ? null : SslCertRevocationStatus.valueOf(status);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(Connection connection) throws SQLException {
|
||||
try (var ps = connection.prepareStatement(
|
||||
"""
|
||||
REPLACE INTO DOMAIN_SECURITY_INFORMATION (
|
||||
domain_id,
|
||||
node_id,
|
||||
http_schema,
|
||||
http_version,
|
||||
http_compression,
|
||||
http_cache_control,
|
||||
ssl_cert_not_before,
|
||||
ssl_cert_not_after,
|
||||
ssl_cert_issuer,
|
||||
ssl_cert_subject,
|
||||
ssl_cert_serial_number,
|
||||
ssl_cert_fingerprint_sha256,
|
||||
ssl_cert_san,
|
||||
ssl_cert_wildcard,
|
||||
ssl_protocol,
|
||||
ssl_cipher_suite,
|
||||
ssl_key_exchange,
|
||||
ssl_certificate_chain_length,
|
||||
ssl_certificate_valid,
|
||||
header_cors_allow_origin,
|
||||
header_cors_allow_credentials,
|
||||
header_content_security_policy_hash,
|
||||
header_strict_transport_security,
|
||||
header_referrer_policy,
|
||||
header_x_frame_options,
|
||||
header_x_content_type_options,
|
||||
header_x_xss_protection,
|
||||
header_server,
|
||||
header_x_powered_by,
|
||||
ssl_cert_public_key_hash,
|
||||
asn,
|
||||
ts_last_update,
|
||||
ssl_chain_valid,
|
||||
ssl_host_valid,
|
||||
ssl_date_valid)
|
||||
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
|
||||
"""))
|
||||
{
|
||||
ps.setInt(1, domainId());
|
||||
ps.setInt(2, nodeId());
|
||||
if (httpSchema() == null) {
|
||||
ps.setNull(3, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(3, httpSchema().name());
|
||||
}
|
||||
if (httpVersion() == null) {
|
||||
ps.setNull(4, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(4, httpVersion());
|
||||
}
|
||||
if (httpCompression() == null) {
|
||||
ps.setNull(5, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(5, httpCompression());
|
||||
}
|
||||
if (httpCacheControl() == null) {
|
||||
ps.setNull(6, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(6, httpCacheControl());
|
||||
}
|
||||
if (sslCertNotBefore() == null) {
|
||||
ps.setNull(7, java.sql.Types.TIMESTAMP);
|
||||
} else {
|
||||
ps.setTimestamp(7, java.sql.Timestamp.from(sslCertNotBefore()));
|
||||
}
|
||||
if (sslCertNotAfter() == null) {
|
||||
ps.setNull(8, java.sql.Types.TIMESTAMP);
|
||||
} else {
|
||||
ps.setTimestamp(8, java.sql.Timestamp.from(sslCertNotAfter()));
|
||||
}
|
||||
if (sslCertIssuer() == null) {
|
||||
ps.setNull(9, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(9, sslCertIssuer());
|
||||
}
|
||||
if (sslCertSubject() == null) {
|
||||
ps.setNull(10, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(10, sslCertSubject());
|
||||
}
|
||||
if (sslCertSerialNumber() == null) {
|
||||
ps.setNull(11, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(11, sslCertSerialNumber());
|
||||
}
|
||||
if (sslCertFingerprintSha256() == null) {
|
||||
ps.setNull(12, java.sql.Types.BINARY);
|
||||
} else {
|
||||
ps.setBytes(12, sslCertFingerprintSha256());
|
||||
}
|
||||
if (sslCertSan() == null) {
|
||||
ps.setNull(13, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(13, sslCertSan());
|
||||
}
|
||||
ps.setBoolean(14, sslCertWildcard());
|
||||
if (sslProtocol() == null) {
|
||||
ps.setNull(15, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(15, sslProtocol());
|
||||
}
|
||||
if (sslCipherSuite() == null) {
|
||||
ps.setNull(16, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(16, sslCipherSuite());
|
||||
}
|
||||
if (sslKeyExchange() == null) {
|
||||
ps.setNull(17, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(17, sslKeyExchange());
|
||||
}
|
||||
if (sslCertificateChainLength() == null) {
|
||||
ps.setNull(18, java.sql.Types.INTEGER);
|
||||
} else {
|
||||
ps.setInt(18, sslCertificateChainLength());
|
||||
}
|
||||
ps.setBoolean(19, sslCertificateValid());
|
||||
if (headerCorsAllowOrigin() == null) {
|
||||
ps.setNull(20, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(20, headerCorsAllowOrigin());
|
||||
}
|
||||
ps.setBoolean(21, headerCorsAllowCredentials());
|
||||
if (headerContentSecurityPolicyHash() == null) {
|
||||
ps.setNull(22, Types.INTEGER);
|
||||
} else {
|
||||
ps.setInt(22, headerContentSecurityPolicyHash());
|
||||
}
|
||||
if (headerStrictTransportSecurity() == null) {
|
||||
ps.setNull(23, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(23, headerStrictTransportSecurity());
|
||||
}
|
||||
if (headerReferrerPolicy() == null) {
|
||||
ps.setNull(24, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(24, headerReferrerPolicy());
|
||||
}
|
||||
if (headerXFrameOptions() == null) {
|
||||
ps.setNull(25, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(25, headerXFrameOptions());
|
||||
}
|
||||
if (headerXContentTypeOptions() == null) {
|
||||
ps.setNull(26, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(26, headerXContentTypeOptions());
|
||||
}
|
||||
if (headerXXssProtection() == null) {
|
||||
ps.setNull(27, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(27, headerXXssProtection());
|
||||
}
|
||||
if (headerServer() == null) {
|
||||
ps.setNull(28, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(28, headerServer());
|
||||
}
|
||||
if (headerXPoweredBy() == null) {
|
||||
ps.setNull(29, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(29, headerXPoweredBy());
|
||||
}
|
||||
if (sslCertPublicKeyHash() == null) {
|
||||
ps.setNull(30, java.sql.Types.BINARY);
|
||||
} else {
|
||||
ps.setBytes(30, sslCertPublicKeyHash());
|
||||
}
|
||||
if (asn() == null) {
|
||||
ps.setNull(31, java.sql.Types.INTEGER);
|
||||
} else {
|
||||
ps.setInt(31, asn());
|
||||
}
|
||||
|
||||
if (tsLastUpdate() == null) {
|
||||
ps.setNull(32, java.sql.Types.TIMESTAMP);
|
||||
} else {
|
||||
ps.setTimestamp(32, java.sql.Timestamp.from(tsLastUpdate()));
|
||||
}
|
||||
|
||||
if (sslChainValid() == null) {
|
||||
ps.setNull(33, java.sql.Types.BOOLEAN);
|
||||
} else {
|
||||
ps.setBoolean(33, sslChainValid());
|
||||
}
|
||||
|
||||
if (sslHostValid() == null) {
|
||||
ps.setNull(34, java.sql.Types.BOOLEAN);
|
||||
} else {
|
||||
ps.setBoolean(34, sslHostValid());
|
||||
}
|
||||
|
||||
if (sslDateValid() == null) {
|
||||
ps.setNull(35, java.sql.Types.BOOLEAN);
|
||||
} else {
|
||||
ps.setBoolean(35, sslDateValid());
|
||||
}
|
||||
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
public static class Builder {
|
||||
private int domainId;
|
||||
private int nodeId;
|
||||
private Integer asn;
|
||||
private HttpSchema httpSchema;
|
||||
private String httpVersion;
|
||||
private String httpCompression;
|
||||
private String httpCacheControl;
|
||||
private Instant sslCertNotBefore;
|
||||
private Instant sslCertNotAfter;
|
||||
private String sslCertIssuer;
|
||||
private String sslCertSubject;
|
||||
private String sslCertSerialNumber;
|
||||
private byte[] sslCertPublicKeyHash;
|
||||
private byte[] sslCertFingerprintSha256;
|
||||
private String sslCertSan;
|
||||
private boolean sslCertWildcard;
|
||||
private String sslProtocol;
|
||||
private String sslCipherSuite;
|
||||
private String sslKeyExchange;
|
||||
private Integer sslCertificateChainLength;
|
||||
private boolean sslCertificateValid;
|
||||
private String headerCorsAllowOrigin;
|
||||
private boolean headerCorsAllowCredentials;
|
||||
private Integer headerContentSecurityPolicyHash;
|
||||
private String headerStrictTransportSecurity;
|
||||
private String headerReferrerPolicy;
|
||||
private String headerXFrameOptions;
|
||||
private String headerXContentTypeOptions;
|
||||
private String headerXXssProtection;
|
||||
private String headerServer;
|
||||
private String headerXPoweredBy;
|
||||
private Instant tsLastUpdate;
|
||||
|
||||
private Boolean isCertChainValid;
|
||||
private Boolean isCertHostValid;
|
||||
private Boolean isCertDateValid;
|
||||
|
||||
|
||||
private static Instant MAX_UNIX_TIMESTAMP = Instant.ofEpochSecond(Integer.MAX_VALUE);
|
||||
|
||||
public Builder() {
|
||||
// Default values for boolean fields
|
||||
this.sslCertWildcard = false;
|
||||
this.sslCertificateValid = false;
|
||||
this.headerCorsAllowCredentials = false;
|
||||
}
|
||||
|
||||
public Builder domainId(int domainId) {
|
||||
this.domainId = domainId;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder nodeId(int nodeId) {
|
||||
this.nodeId = nodeId;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder asn(@Nullable Integer asn) {
|
||||
this.asn = asn;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder httpSchema(HttpSchema httpSchema) {
|
||||
this.httpSchema = httpSchema;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder httpVersion(String httpVersion) {
|
||||
this.httpVersion = StringUtils.truncate(httpVersion, 10);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder httpCompression(String httpCompression) {
|
||||
this.httpCompression = StringUtils.truncate(httpCompression, 50);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder httpCacheControl(String httpCacheControl) {
|
||||
this.httpCacheControl = httpCacheControl;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder sslCertNotBefore(@NotNull Instant sslCertNotBefore) {
|
||||
if (sslCertNotBefore.isAfter(MAX_UNIX_TIMESTAMP)) {
|
||||
sslCertNotBefore = MAX_UNIX_TIMESTAMP;
|
||||
}
|
||||
this.sslCertNotBefore = sslCertNotBefore;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder sslCertNotAfter(@NotNull Instant sslCertNotAfter) {
|
||||
if (sslCertNotAfter.isAfter(MAX_UNIX_TIMESTAMP)) {
|
||||
sslCertNotAfter = MAX_UNIX_TIMESTAMP;
|
||||
}
|
||||
this.sslCertNotAfter = sslCertNotAfter;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder sslCertIssuer(String sslCertIssuer) {
|
||||
this.sslCertIssuer = StringUtils.truncate(sslCertIssuer, 255);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder sslCertSubject(String sslCertSubject) {
|
||||
this.sslCertSubject = StringUtils.truncate(sslCertSubject, 255);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder sslCertSerialNumber(String sslCertSerialNumber) {
|
||||
this.sslCertSerialNumber = sslCertSerialNumber;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder sslCertPublicKeyHash(byte[] sslCertPublicKeyHash) {
|
||||
this.sslCertPublicKeyHash = sslCertPublicKeyHash;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder sslCertFingerprintSha256(byte[] sslCertFingerprintSha256) {
|
||||
this.sslCertFingerprintSha256 = sslCertFingerprintSha256;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder sslCertSan(String sslCertSan) {
|
||||
this.sslCertSan = sslCertSan;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder sslCertWildcard(boolean sslCertWildcard) {
|
||||
this.sslCertWildcard = sslCertWildcard;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder sslProtocol(String sslProtocol) {
|
||||
this.sslProtocol = sslProtocol;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder sslCipherSuite(String sslCipherSuite) {
|
||||
this.sslCipherSuite = sslCipherSuite;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder sslKeyExchange(String sslKeyExchange) {
|
||||
this.sslKeyExchange = sslKeyExchange;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder sslCertificateChainLength(Integer sslCertificateChainLength) {
|
||||
this.sslCertificateChainLength = sslCertificateChainLength;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder sslCertificateValid(boolean sslCertificateValid) {
|
||||
this.sslCertificateValid = sslCertificateValid;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder headerCorsAllowOrigin(String headerCorsAllowOrigin) {
|
||||
this.headerCorsAllowOrigin = headerCorsAllowOrigin;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder headerCorsAllowCredentials(boolean headerCorsAllowCredentials) {
|
||||
this.headerCorsAllowCredentials = headerCorsAllowCredentials;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder headerContentSecurityPolicyHash(Integer headerContentSecurityPolicyHash) {
|
||||
this.headerContentSecurityPolicyHash = headerContentSecurityPolicyHash;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder headerStrictTransportSecurity(String headerStrictTransportSecurity) {
|
||||
this.headerStrictTransportSecurity = StringUtils.truncate(headerStrictTransportSecurity, 255);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder headerReferrerPolicy(String headerReferrerPolicy) {
|
||||
this.headerReferrerPolicy = StringUtils.truncate(headerReferrerPolicy, 50);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder headerXFrameOptions(String headerXFrameOptions) {
|
||||
this.headerXFrameOptions = StringUtils.truncate(headerXFrameOptions, 50);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder headerXContentTypeOptions(String headerXContentTypeOptions) {
|
||||
this.headerXContentTypeOptions = StringUtils.truncate(headerXContentTypeOptions, 50);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder headerXXssProtection(String headerXXssProtection) {
|
||||
this.headerXXssProtection = StringUtils.truncate(headerXXssProtection, 50);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder headerServer(String headerServer) {
|
||||
this.headerServer = StringUtils.truncate(headerServer, 255);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder headerXPoweredBy(String headerXPoweredBy) {
|
||||
this.headerXPoweredBy = StringUtils.truncate(headerXPoweredBy, 255);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder tsLastUpdate(Instant tsLastUpdate) {
|
||||
this.tsLastUpdate = tsLastUpdate;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder sslChainValid(@Nullable Boolean isCertChainValid) {
|
||||
this.isCertChainValid = isCertChainValid;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder sslHostValid(@Nullable Boolean isCertHostValid) {
|
||||
this.isCertHostValid = isCertHostValid;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder sslDateValid(@Nullable Boolean isCertDateValid) {
|
||||
this.isCertDateValid = isCertDateValid;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainSecurityRecord build() {
|
||||
return new DomainSecurityRecord(
|
||||
domainId,
|
||||
nodeId,
|
||||
asn,
|
||||
httpSchema,
|
||||
httpVersion,
|
||||
httpCompression,
|
||||
httpCacheControl,
|
||||
sslCertNotBefore,
|
||||
sslCertNotAfter,
|
||||
sslCertIssuer,
|
||||
sslCertSubject,
|
||||
sslCertPublicKeyHash,
|
||||
sslCertSerialNumber,
|
||||
sslCertFingerprintSha256,
|
||||
sslCertSan,
|
||||
sslCertWildcard,
|
||||
sslProtocol,
|
||||
sslCipherSuite,
|
||||
sslKeyExchange,
|
||||
sslCertificateChainLength,
|
||||
sslCertificateValid,
|
||||
headerCorsAllowOrigin,
|
||||
headerCorsAllowCredentials,
|
||||
headerContentSecurityPolicyHash,
|
||||
headerStrictTransportSecurity,
|
||||
headerReferrerPolicy,
|
||||
headerXFrameOptions,
|
||||
headerXContentTypeOptions,
|
||||
headerXXssProtection,
|
||||
headerServer,
|
||||
headerXPoweredBy,
|
||||
tsLastUpdate,
|
||||
isCertChainValid,
|
||||
isCertHostValid,
|
||||
isCertDateValid
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Creates a new, empty {@link Builder} for assembling a record instance.
 */
public static Builder builder() {
    return new Builder();
}
|
||||
}
|
@@ -0,0 +1,12 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
/**
 * Coarse classification of the failure mode observed when probing a domain.
 */
public enum ErrorClassification {
    NONE,               // no error observed
    TIMEOUT,            // the probe timed out
    SSL_ERROR,          // TLS handshake or certificate failure
    DNS_ERROR,          // name resolution failure
    CONNECTION_ERROR,   // connection could not be established
    HTTP_CLIENT_ERROR,  // HTTP response indicated a client-side error (presumably 4xx)
    HTTP_SERVER_ERROR,  // HTTP response indicated a server-side error (presumably 5xx)
    UNKNOWN             // an error that fits none of the other categories
}
|
@@ -0,0 +1,13 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
public sealed interface HistoricalAvailabilityData {
|
||||
public String domain();
|
||||
record JustDomainReference(DomainReference domainReference) implements HistoricalAvailabilityData {
|
||||
@Override
|
||||
public String domain() {
|
||||
return domainReference.domainName();
|
||||
}
|
||||
}
|
||||
record JustAvailability(String domain, DomainAvailabilityRecord record) implements HistoricalAvailabilityData {}
|
||||
record AvailabilityAndSecurity(String domain, DomainAvailabilityRecord availabilityRecord, DomainSecurityRecord securityRecord) implements HistoricalAvailabilityData {}
|
||||
}
|
@@ -0,0 +1,6 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
/**
 * The URI scheme a domain is served over.
 */
public enum HttpSchema {
    HTTP,
    HTTPS;
}
|
@@ -0,0 +1,6 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
/**
 * A reference to a root domain, identified either by database id and name,
 * or by name alone.
 */
public sealed interface RootDomainReference {
    /** Reference by both database id and domain name. */
    record ByIdAndName(long id, String name) implements RootDomainReference { }
    /** Reference by domain name only, e.g. when no id is known. */
    record ByName(String name) implements RootDomainReference { }
}
|
@@ -0,0 +1,12 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
/**
 * Describes how a domain's URI scheme changed between observations.
 */
public enum SchemaChange {
    UNKNOWN,
    NONE,
    HTTP_TO_HTTPS,
    HTTPS_TO_HTTP;

    /**
     * True when an actual scheme transition was observed, i.e. for any
     * value other than {@link #NONE} and {@link #UNKNOWN}.
     */
    public boolean isSignificant() {
        return switch (this) {
            case NONE, UNKNOWN -> false;
            default -> true;
        };
    }
}
|
@@ -0,0 +1,8 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
/**
 * A single DNS record.
 *
 * @param recordType the DNS record type label (e.g. "A", "MX" — presumably; confirm against producer)
 * @param data the record's data as a string
 */
public record SingleDnsRecord(
        String recordType,
        String data
) {

}
|
@@ -0,0 +1,8 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
/**
 * Outcome of an SSL certificate revocation check.
 */
public enum SslCertRevocationStatus {
    NOT_CHECKED,  // no revocation check has been performed
    VALID,        // checked and not revoked
    REVOKED,      // checked and found revoked
    UNKNOWN       // check performed but outcome could not be determined
}
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user