mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
97 Commits
deploy-018
...
deploy-024
Author | SHA1 | Date | |
---|---|---|---|
|
0a0e88fd6e | ||
|
b4fc0c4368 | ||
|
87ee8765b8 | ||
|
1adf4835fa | ||
|
b7b5d0bf46 | ||
|
416059adde | ||
|
db7930016a | ||
|
82456ad673 | ||
|
0882a6d9cd | ||
|
5020029c2d | ||
|
ac44d0b093 | ||
|
4b32b9b10e | ||
|
9f041d6631 | ||
|
13fb1efce4 | ||
|
c1225165b7 | ||
|
67ad7a3bbc | ||
|
ed62ec8a35 | ||
|
42b24cfa34 | ||
|
1ffaab2da6 | ||
|
5f93c7f767 | ||
|
4001c68c82 | ||
|
6b811489c5 | ||
|
e9d317c65d | ||
|
16b05a4737 | ||
|
021cd73cbb | ||
|
4253bd53b5 | ||
|
14c87461a5 | ||
|
9afed0a18e | ||
|
afad4deb94 | ||
|
f071c947e4 | ||
|
79996c9348 | ||
|
db907ab06a | ||
|
c49cd9dd95 | ||
|
eec9df3b0a | ||
|
e5f3288de6 | ||
|
d587544d3a | ||
|
1a9ae1bc40 | ||
|
e0c81e956a | ||
|
542fb12b38 | ||
|
65ec734566 | ||
|
10b6a25c63 | ||
|
6260f6bec7 | ||
|
d6d5467696 | ||
|
034560ca75 | ||
|
e994fddae4 | ||
|
345f01f306 | ||
|
5a8e286689 | ||
|
39a055aa94 | ||
|
37aaa90dc9 | ||
|
24022c5adc | ||
|
1de9ecc0b6 | ||
|
9b80245ea0 | ||
|
4e1595c1a6 | ||
|
0be8585fa5 | ||
|
a0fe070fe7 | ||
|
abe9da0fc6 | ||
|
56d0128b0a | ||
|
840b68ac55 | ||
|
c34ff6d6c3 | ||
|
32780967d8 | ||
|
7330bc489d | ||
|
ea23f33738 | ||
|
4a8a028118 | ||
|
a25bc647be | ||
|
a720dba3a2 | ||
|
284f382867 | ||
|
a80717f138 | ||
|
d6da715fa4 | ||
|
c1ec7aa491 | ||
|
3daf37e283 | ||
|
44a774d3a8 | ||
|
597aeaf496 | ||
|
06df7892c2 | ||
|
dc26854268 | ||
|
9f16326cba | ||
|
ed66d0b3a7 | ||
|
c3afc82dad | ||
|
08e25e539e | ||
|
4946044dd0 | ||
|
edf382e1c5 | ||
|
644cba32e4 | ||
|
34b76390b2 | ||
|
43cd507971 | ||
|
cc40e99fdc | ||
|
8a944cf4c6 | ||
|
1c128e6d82 | ||
|
be039d1a8c | ||
|
4edc0d3267 | ||
|
890f521d0d | ||
|
b1814a30f7 | ||
|
f59a9eb025 | ||
|
599534806b | ||
|
7e8253dac7 | ||
|
97a6780ea3 | ||
|
eb634beec8 | ||
|
269ebd1654 | ||
|
39ce40bfeb |
16
ROADMAP.md
16
ROADMAP.md
@@ -38,14 +38,6 @@ associated with each language added, at least a models file or two, as well as s
|
||||
|
||||
It would be very helpful to find a speaker of a large language other than English to help in the fine tuning.
|
||||
|
||||
## Support for binary formats like PDF
|
||||
|
||||
The crawler needs to be modified to retain them, and the conversion logic needs to parse them.
|
||||
The documents database probably should have some sort of flag indicating it's a PDF as well.
|
||||
|
||||
PDF parsing is known to be a bit of a security liability so some thought needs to be put in
|
||||
that direction as well.
|
||||
|
||||
## Custom ranking logic
|
||||
|
||||
Stract does an interesting thing where they have configurable search filters.
|
||||
@@ -66,6 +58,14 @@ One of the search engine's biggest limitations right now is that it does not ind
|
||||
|
||||
# Completed
|
||||
|
||||
## Support for binary formats like PDF (COMPLETED 2025-05)
|
||||
|
||||
The crawler needs to be modified to retain them, and the conversion logic needs to parse them.
|
||||
The documents database probably should have some sort of flag indicating it's a PDF as well.
|
||||
|
||||
PDF parsing is known to be a bit of a security liability so some thought needs to be put in
|
||||
that direction as well.
|
||||
|
||||
## Web Design Overhaul (COMPLETED 2025-01)
|
||||
|
||||
The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
|
||||
|
@@ -1,3 +1,8 @@
|
||||
package nu.marginalia;
|
||||
|
||||
/**
|
||||
* A record representing a User Agent.
|
||||
* @param uaString - the header value of the User Agent
|
||||
* @param uaIdentifier - what we look for in robots.txt
|
||||
*/
|
||||
public record UserAgent(String uaString, String uaIdentifier) {}
|
||||
|
@@ -0,0 +1,5 @@
|
||||
-- Tiered registry of domains flagged as NSFW.
-- NOTE(review): only ID and TIER are stored here; presumably the actual domain
-- association lives elsewhere or is added by a later migration -- confirm.
CREATE TABLE IF NOT EXISTS WMSA_prod.NSFW_DOMAINS (
    ID   INT NOT NULL AUTO_INCREMENT,
    TIER INT NOT NULL,  -- severity tier of the NSFW classification
    PRIMARY KEY (ID)
);
|
@@ -0,0 +1,213 @@
|
||||
|
||||
-- Metadata tables for domain ping status and security information.
--
-- "Ping" here means HTTP(S) probes rather than ICMP: the goal is to track web
-- server uptime and to spot changes in security configuration that may indicate
-- an ownership change or a security incident.
--
-- DOMAIN_ID and NODE_ID identify the domain and the node that performed the
-- probe. They are logically foreign keys into EC_DOMAIN, but since this data is
-- strictly append-only, no foreign key constraints are enforced.

CREATE TABLE IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION (
    DOMAIN_ID INT NOT NULL PRIMARY KEY,
    NODE_ID INT NOT NULL,

    SERVER_AVAILABLE BOOLEAN NOT NULL,                   -- true when the last probe reached the server
    SERVER_IP VARBINARY(16),                             -- server address, IPv4 (4 bytes) or IPv6 (16 bytes)
    SERVER_IP_ASN INTEGER,                               -- autonomous system number of the server address

    DATA_HASH BIGINT,                                    -- content hash, used for integrity/change checks
    SECURITY_CONFIG_HASH BIGINT,                         -- security configuration hash, used for integrity/change checks

    HTTP_SCHEMA ENUM('HTTP', 'HTTPS'),                   -- protocol the probe ended up using
    HTTP_ETAG VARCHAR(255),                              -- ETag response header, if any
    HTTP_LAST_MODIFIED VARCHAR(255),                     -- Last-Modified response header, if any
    HTTP_STATUS INT,                                     -- response status code (e.g. 200, 404)
    HTTP_LOCATION VARCHAR(255),                          -- redirect target when the server redirects
    HTTP_RESPONSE_TIME_MS SMALLINT UNSIGNED,             -- probe round-trip time in milliseconds

    ERROR_CLASSIFICATION ENUM('NONE', 'TIMEOUT', 'SSL_ERROR', 'DNS_ERROR', 'CONNECTION_ERROR', 'HTTP_CLIENT_ERROR', 'HTTP_SERVER_ERROR', 'UNKNOWN'), -- failure category when the server is unavailable
    ERROR_MESSAGE VARCHAR(255),                          -- free-form error detail when the server is unavailable

    TS_LAST_PING TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, -- time of the most recent probe
    TS_LAST_AVAILABLE TIMESTAMP,                         -- last time the server answered successfully
    TS_LAST_ERROR TIMESTAMP,                             -- last time a probe failed

    NEXT_SCHEDULED_UPDATE TIMESTAMP NOT NULL,            -- when the next probe is due
    BACKOFF_CONSECUTIVE_FAILURES INT NOT NULL DEFAULT 0, -- consecutive failed probes; drives the backoff schedule
    BACKOFF_FETCH_INTERVAL INT NOT NULL DEFAULT 60       -- seconds until the next scheduled probe
) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;

CREATE INDEX IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_AVAILABILITY_INFORMATION (NODE_ID, DOMAIN_ID);
CREATE INDEX IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION__NEXT_SCHEDULED_UPDATE_IDX ON DOMAIN_AVAILABILITY_INFORMATION (NODE_ID, NEXT_SCHEDULED_UPDATE);
|
||||
|
||||
|
||||
|
||||
-- Current security profile of each domain's web server, one row per domain.
CREATE TABLE IF NOT EXISTS DOMAIN_SECURITY_INFORMATION (
    DOMAIN_ID INT NOT NULL PRIMARY KEY,
    NODE_ID INT NOT NULL,

    ASN INTEGER,                                     -- autonomous system number of the server
    HTTP_SCHEMA ENUM('HTTP', 'HTTPS'),               -- protocol in use
    HTTP_VERSION VARCHAR(10),                        -- e.g. HTTP/1.1, HTTP/2
    HTTP_COMPRESSION VARCHAR(50),                    -- e.g. gzip, deflate, br
    HTTP_CACHE_CONTROL TEXT,                         -- Cache-Control response directives

    SSL_CERT_NOT_BEFORE TIMESTAMP,                   -- certificate validity start
    SSL_CERT_NOT_AFTER TIMESTAMP,                    -- certificate validity end

    SSL_CERT_ISSUER VARCHAR(255),                    -- issuing certificate authority
    SSL_CERT_SUBJECT VARCHAR(255),                   -- certificate subject / common name

    SSL_CERT_PUBLIC_KEY_HASH BINARY(32),             -- SHA-256 hash of the public key
    SSL_CERT_SERIAL_NUMBER VARCHAR(100),             -- certificate serial number
    SSL_CERT_FINGERPRINT_SHA256 BINARY(32),          -- SHA-256 fingerprint; identifies the exact certificate
    SSL_CERT_SAN TEXT,                               -- subject alternative names (JSON array)
    SSL_CERT_WILDCARD BOOLEAN,                       -- whether this is a wildcard certificate (*.example.com)

    SSL_PROTOCOL VARCHAR(20),                        -- e.g. TLS 1.2, TLS 1.3
    SSL_CIPHER_SUITE VARCHAR(100),                   -- e.g. TLS_AES_256_GCM_SHA384
    SSL_KEY_EXCHANGE VARCHAR(50),                    -- e.g. ECDHE, RSA
    SSL_CERTIFICATE_CHAIN_LENGTH TINYINT,            -- number of certificates in the chain

    SSL_CERTIFICATE_VALID BOOLEAN,                   -- whether the certificate chain validated

    HEADER_CORS_ALLOW_ORIGIN TEXT,                   -- Access-Control-Allow-Origin: *, specific origins, or NULL
    HEADER_CORS_ALLOW_CREDENTIALS BOOLEAN,           -- Access-Control-Allow-Credentials
    HEADER_CONTENT_SECURITY_POLICY_HASH INT,         -- hash of the Content-Security-Policy header
    HEADER_STRICT_TRANSPORT_SECURITY VARCHAR(255),   -- Strict-Transport-Security (HSTS) header
    HEADER_REFERRER_POLICY VARCHAR(50),              -- Referrer-Policy header
    HEADER_X_FRAME_OPTIONS VARCHAR(50),              -- X-Frame-Options (clickjacking protection)
    HEADER_X_CONTENT_TYPE_OPTIONS VARCHAR(50),       -- X-Content-Type-Options (MIME sniffing protection)
    HEADER_X_XSS_PROTECTION VARCHAR(50),             -- X-XSS-Protection header

    HEADER_SERVER VARCHAR(255),                      -- Server header (e.g. Apache, Nginx)
    HEADER_X_POWERED_BY VARCHAR(255),                -- X-Powered-By header, when present

    TS_LAST_UPDATE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP -- time of the last security check
) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;

CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_INFORMATION__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_SECURITY_INFORMATION (NODE_ID, DOMAIN_ID);
|
||||
|
||||
-- One row per observed change in a domain's security profile.
CREATE TABLE IF NOT EXISTS DOMAIN_SECURITY_EVENTS (
    CHANGE_ID BIGINT AUTO_INCREMENT PRIMARY KEY,     -- unique identifier of the change event
    DOMAIN_ID INT NOT NULL,                          -- logical foreign key into EC_DOMAIN (not enforced)
    NODE_ID INT NOT NULL,

    -- NOTE(review): ON UPDATE CURRENT_TIMESTAMP on an event log would clobber the
    -- event time if a row were ever updated; presumably rows are append-only -- confirm.
    TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, -- when the change was observed

    -- One flag per aspect of the security profile that changed:
    CHANGE_ASN BOOLEAN NOT NULL DEFAULT FALSE,                     -- autonomous system number changed
    CHANGE_CERTIFICATE_FINGERPRINT BOOLEAN NOT NULL DEFAULT FALSE, -- SSL certificate fingerprint changed
    CHANGE_CERTIFICATE_PROFILE BOOLEAN NOT NULL DEFAULT FALSE,     -- certificate profile (algorithm, key exchange) changed
    CHANGE_CERTIFICATE_SAN BOOLEAN NOT NULL DEFAULT FALSE,         -- subject alternative names changed
    CHANGE_CERTIFICATE_PUBLIC_KEY BOOLEAN NOT NULL DEFAULT FALSE,  -- certificate public key changed
    CHANGE_SECURITY_HEADERS BOOLEAN NOT NULL DEFAULT FALSE,        -- security-related response headers changed
    CHANGE_IP_ADDRESS BOOLEAN NOT NULL DEFAULT FALSE,              -- server IP address changed
    CHANGE_SOFTWARE BOOLEAN NOT NULL DEFAULT FALSE,                -- generator / web server software changed
    OLD_CERT_TIME_TO_EXPIRY INT,                                   -- hours until the old certificate would have expired, if applicable

    SECURITY_SIGNATURE_BEFORE BLOB NOT NULL,         -- gzipped JSON security snapshot before the change
    SECURITY_SIGNATURE_AFTER BLOB NOT NULL           -- gzipped JSON security snapshot after the change
) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;

CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_EVENTS__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_SECURITY_EVENTS (NODE_ID, DOMAIN_ID);
CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_EVENTS__TS_CHANGE_IDX ON DOMAIN_SECURITY_EVENTS (TS_CHANGE);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS DOMAIN_AVAILABILITY_EVENTS (
    DOMAIN_ID INT NOT NULL,
    NODE_ID INT NOT NULL,

    AVAILABLE BOOLEAN NOT NULL, -- True if the service is available, false if it is not
    OUTAGE_TYPE ENUM('NONE', 'TIMEOUT', 'SSL_ERROR', 'DNS_ERROR', 'CONNECTION_ERROR', 'HTTP_CLIENT_ERROR', 'HTTP_SERVER_ERROR', 'UNKNOWN') NOT NULL,
    HTTP_STATUS_CODE INT, -- HTTP status code if available (e.g., 200, 404, etc.)
    ERROR_MESSAGE VARCHAR(255), -- Specific error details

    TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, -- Timestamp of the last update

    AVAILABILITY_RECORD_ID BIGINT AUTO_INCREMENT,
    P_KEY_MONTH TINYINT NOT NULL DEFAULT MONTH(TS_CHANGE), -- Month (1-12) of the change, used as partitioning key
    PRIMARY KEY (AVAILABILITY_RECORD_ID, P_KEY_MONTH)
)
CHARACTER SET utf8mb4 COLLATE utf8mb4_bin
-- MONTH() yields 1-12, so month m must map to the partition VALUES LESS THAN (m + 1).
-- The previous bounds (LESS THAN 1 ... LESS THAN 12) left p0 permanently empty,
-- shifted every month into the wrong partition, and provided no partition at all for
-- December (12): December inserts would fail with ER_NO_PARTITION_FOR_GIVEN_VALUE.
PARTITION BY RANGE (P_KEY_MONTH) (
    PARTITION p0 VALUES LESS THAN (2),   -- January
    PARTITION p1 VALUES LESS THAN (3),   -- February
    PARTITION p2 VALUES LESS THAN (4),   -- March
    PARTITION p3 VALUES LESS THAN (5),   -- April
    PARTITION p4 VALUES LESS THAN (6),   -- May
    PARTITION p5 VALUES LESS THAN (7),   -- June
    PARTITION p6 VALUES LESS THAN (8),   -- July
    PARTITION p7 VALUES LESS THAN (9),   -- August
    PARTITION p8 VALUES LESS THAN (10),  -- September
    PARTITION p9 VALUES LESS THAN (11),  -- October
    PARTITION p10 VALUES LESS THAN (12), -- November
    PARTITION p11 VALUES LESS THAN (13)  -- December
);

CREATE INDEX DOMAIN_AVAILABILITY_EVENTS__DOMAIN_ID_TS_IDX ON DOMAIN_AVAILABILITY_EVENTS (DOMAIN_ID, TS_CHANGE);
CREATE INDEX DOMAIN_AVAILABILITY_EVENTS__TS_CHANGE_IDX ON DOMAIN_AVAILABILITY_EVENTS (TS_CHANGE);
|
||||
|
||||
-- Latest DNS snapshot per root domain, plus scheduling state for the next check.
CREATE TABLE IF NOT EXISTS DOMAIN_DNS_INFORMATION (
    DNS_ROOT_DOMAIN_ID INT AUTO_INCREMENT PRIMARY KEY,
    ROOT_DOMAIN_NAME VARCHAR(255) NOT NULL UNIQUE,
    NODE_AFFINITY INT NOT NULL,          -- node that performs the DNS check; assigned randomly across nodes

    DNS_A_RECORDS TEXT,                  -- JSON array of IPv4 addresses
    DNS_AAAA_RECORDS TEXT,               -- JSON array of IPv6 addresses
    DNS_CNAME_RECORD VARCHAR(255),       -- canonical name, when present
    DNS_MX_RECORDS TEXT,                 -- JSON array of mail exchange records
    DNS_CAA_RECORDS TEXT,                -- certificate authority authorization records
    DNS_TXT_RECORDS TEXT,                -- TXT records (SPF, DKIM, verification tokens, ...)
    DNS_NS_RECORDS TEXT,                 -- JSON array of name servers
    DNS_SOA_RECORD TEXT,                 -- start-of-authority record (JSON object)

    TS_LAST_DNS_CHECK TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    TS_NEXT_DNS_CHECK TIMESTAMP NOT NULL,
    DNS_CHECK_PRIORITY TINYINT DEFAULT 0 -- higher priority schedules a refresh sooner
) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
-- NOTE(review): collation is utf8mb4_unicode_ci, unlike the _bin tables above --
-- presumably intentional for case-insensitive domain-name lookups; confirm.

CREATE INDEX DOMAIN_DNS_INFORMATION__PRIORITY_NEXT_CHECK_IDX ON DOMAIN_DNS_INFORMATION (NODE_AFFINITY, DNS_CHECK_PRIORITY DESC, TS_NEXT_DNS_CHECK);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS DOMAIN_DNS_EVENTS (
    DNS_ROOT_DOMAIN_ID INT NOT NULL,
    NODE_ID INT NOT NULL,

    TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,

    -- DNS change type flags
    CHANGE_A_RECORDS BOOLEAN NOT NULL DEFAULT FALSE,    -- IPv4 address changes
    CHANGE_AAAA_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- IPv6 address changes
    CHANGE_CNAME BOOLEAN NOT NULL DEFAULT FALSE,        -- CNAME changes
    CHANGE_MX_RECORDS BOOLEAN NOT NULL DEFAULT FALSE,   -- Mail server changes
    CHANGE_CAA_RECORDS BOOLEAN NOT NULL DEFAULT FALSE,  -- Certificate authority changes
    CHANGE_TXT_RECORDS BOOLEAN NOT NULL DEFAULT FALSE,  -- TXT record changes (SPF, DKIM, etc.)
    CHANGE_NS_RECORDS BOOLEAN NOT NULL DEFAULT FALSE,   -- Name server changes (big red flag!)
    CHANGE_SOA_RECORD BOOLEAN NOT NULL DEFAULT FALSE,   -- Start of Authority changes

    DNS_SIGNATURE_BEFORE BLOB NOT NULL, -- Compressed JSON snapshot of DNS records before change
    DNS_SIGNATURE_AFTER BLOB NOT NULL,  -- Compressed JSON snapshot of DNS records after change

    DNS_EVENT_ID BIGINT AUTO_INCREMENT,
    P_KEY_MONTH TINYINT NOT NULL DEFAULT MONTH(TS_CHANGE), -- Month (1-12) of the change, used as partitioning key
    PRIMARY KEY (DNS_EVENT_ID, P_KEY_MONTH)
)
CHARACTER SET utf8mb4 COLLATE utf8mb4_bin
-- MONTH() yields 1-12, so month m must map to the partition VALUES LESS THAN (m + 1).
-- The previous bounds (LESS THAN 1 ... LESS THAN 12) left p0 permanently empty and
-- provided no partition for December (12), so December inserts would be rejected
-- with ER_NO_PARTITION_FOR_GIVEN_VALUE.
PARTITION BY RANGE (P_KEY_MONTH) (
    PARTITION p0 VALUES LESS THAN (2),   -- January
    PARTITION p1 VALUES LESS THAN (3),   -- February
    PARTITION p2 VALUES LESS THAN (4),   -- March
    PARTITION p3 VALUES LESS THAN (5),   -- April
    PARTITION p4 VALUES LESS THAN (6),   -- May
    PARTITION p5 VALUES LESS THAN (7),   -- June
    PARTITION p6 VALUES LESS THAN (8),   -- July
    PARTITION p7 VALUES LESS THAN (9),   -- August
    PARTITION p8 VALUES LESS THAN (10),  -- September
    PARTITION p9 VALUES LESS THAN (11),  -- October
    PARTITION p10 VALUES LESS THAN (12), -- November
    PARTITION p11 VALUES LESS THAN (13)  -- December
);

CREATE INDEX DOMAIN_DNS_EVENTS__DNS_ROOT_DOMAIN_ID_TS_IDX ON DOMAIN_DNS_EVENTS (DNS_ROOT_DOMAIN_ID, TS_CHANGE);
CREATE INDEX DOMAIN_DNS_EVENTS__TS_CHANGE_IDX ON DOMAIN_DNS_EVENTS (TS_CHANGE);
|
@@ -0,0 +1,6 @@
|
||||
-- Add additional summary columns to DOMAIN_SECURITY_EVENTS table
-- to make it easier to make sense of certificate changes.

-- Both columns are added in a single ALTER so the table is rebuilt
-- at most once instead of twice.
ALTER TABLE DOMAIN_SECURITY_EVENTS
    ADD COLUMN CHANGE_CERTIFICATE_SERIAL_NUMBER BOOLEAN NOT NULL DEFAULT FALSE,
    ADD COLUMN CHANGE_CERTIFICATE_ISSUER BOOLEAN NOT NULL DEFAULT FALSE;
OPTIMIZE TABLE DOMAIN_SECURITY_EVENTS;
|
@@ -0,0 +1,5 @@
|
||||
-- Add a summary column to the DOMAIN_SECURITY_EVENTS table recording
-- HTTP <-> HTTPS schema transitions, to make it easier to make sense of
-- security changes. (Previous header comment was copy-pasted from the
-- certificate-columns migration and described the wrong change.)

ALTER TABLE DOMAIN_SECURITY_EVENTS ADD COLUMN CHANGE_SCHEMA ENUM('NONE', 'HTTP_TO_HTTPS', 'HTTPS_TO_HTTP', 'UNKNOWN') NOT NULL DEFAULT 'UNKNOWN';
OPTIMIZE TABLE DOMAIN_SECURITY_EVENTS;
|
@@ -112,14 +112,6 @@ public class EdgeDomain implements Serializable {
|
||||
return topDomain;
|
||||
}
|
||||
|
||||
public String getDomainKey() {
|
||||
int cutPoint = topDomain.indexOf('.');
|
||||
if (cutPoint < 0) {
|
||||
return topDomain;
|
||||
}
|
||||
return topDomain.substring(0, cutPoint).toLowerCase();
|
||||
}
|
||||
|
||||
/** If possible, try to provide an alias domain,
|
||||
* i.e. a domain name that is very likely to link to this one
|
||||
* */
|
||||
|
@@ -6,11 +6,20 @@ import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
import java.time.Instant;
|
||||
|
||||
public class GsonFactory {
|
||||
public static Gson get() {
|
||||
return new GsonBuilder()
|
||||
.registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
|
||||
.registerTypeAdapter(Instant.class, (JsonSerializer<Instant>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toEpochMilli()))
|
||||
.registerTypeAdapter(Instant.class, (JsonDeserializer<Instant>) (json, typeOfT, context) -> {
|
||||
if (json.isJsonPrimitive() && json.getAsJsonPrimitive().isNumber()) {
|
||||
return Instant.ofEpochMilli(json.getAsLong());
|
||||
} else {
|
||||
throw new JsonParseException("Expected a number for Instant");
|
||||
}
|
||||
})
|
||||
.registerTypeAdapter(EdgeUrl.class, (JsonSerializer<EdgeUrl>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
|
||||
.registerTypeAdapter(EdgeDomain.class, (JsonSerializer<EdgeDomain>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
|
||||
.registerTypeAdapter(EdgeUrl.class, (JsonDeserializer<EdgeUrl>) (json, typeOfT, context) -> {
|
||||
|
@@ -8,14 +8,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
class EdgeDomainTest {
|
||||
|
||||
@Test
|
||||
public void testSkepdic() throws URISyntaxException {
|
||||
var domain = new EdgeUrl("http://www.skepdic.com/astrology.html");
|
||||
assertEquals("skepdic", domain.getDomain().getDomainKey());
|
||||
var domain2 = new EdgeUrl("http://skepdic.com/astrology.html");
|
||||
assertEquals("skepdic", domain2.getDomain().getDomainKey());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHkDomain() throws URISyntaxException {
|
||||
var domain = new EdgeUrl("http://l7072i3.l7c.net");
|
||||
|
@@ -0,0 +1,59 @@
|
||||
package nu.marginalia.process.control;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.process.ProcessConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.sql.SQLException;
import java.util.Objects;
import java.util.UUID;

/** Records process lifecycle events in the shared SERVICE_EVENTLOG database table.
 * <p>
 * Event logging is best-effort: a failed insert is logged locally and otherwise
 * ignored, so the event log can never take the process down.
 */
@Singleton
public class ProcessEventLog {
    private final HikariDataSource dataSource;

    private final Logger logger = LoggerFactory.getLogger(ProcessEventLog.class);

    private final String serviceName;   // process name qualified with node id, e.g. "converter:1"
    private final UUID instanceUuid;    // unique id of this process instance
    private final String serviceBase;   // bare process name, without the node qualifier

    @Inject
    public ProcessEventLog(HikariDataSource dataSource, ProcessConfiguration configuration) {
        this.dataSource = dataSource;

        this.serviceName = configuration.processName() + ":" + configuration.node();
        this.instanceUuid = configuration.instanceUuid();
        this.serviceBase = configuration.processName();

        logger.info("Starting service {} instance {}", serviceName, instanceUuid);

        logEvent("PCS-START", serviceName);
    }

    /** Logs an event using the simple name of {@code type} as the event type. */
    public void logEvent(Class<?> type, String message) {
        logEvent(type.getSimpleName(), message);
    }

    /** Inserts an event row into SERVICE_EVENTLOG.
     *
     * @param type short event type tag, e.g. "PCS-START"
     * @param message free-form event detail; null is stored as the empty string
     */
    public void logEvent(String type, String message) {

        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("""
                     INSERT INTO SERVICE_EVENTLOG(SERVICE_NAME, SERVICE_BASE, INSTANCE, EVENT_TYPE, EVENT_MESSAGE)
                     VALUES (?, ?, ?, ?, ?)
                     """)) {
            stmt.setString(1, serviceName);
            stmt.setString(2, serviceBase);
            stmt.setString(3, instanceUuid.toString());
            stmt.setString(4, type);
            // Bug fix: the original used Objects.requireNonNull(message, ""), which
            // *throws* NullPointerException when message is null rather than
            // substituting "". requireNonNullElse performs the intended defaulting.
            stmt.setString(5, Objects.requireNonNullElse(message, ""));

            stmt.executeUpdate();
        }
        catch (SQLException ex) {
            // Include the exception as the final argument so the cause is not dropped
            logger.error("Failed to log event {}:{}", type, message, ex);
        }
    }
}
|
@@ -1,17 +1,21 @@
|
||||
package nu.marginalia.service.discovery;
|
||||
|
||||
import nu.marginalia.service.discovery.monitor.*;
|
||||
import com.google.inject.ImplementedBy;
|
||||
import nu.marginalia.service.discovery.monitor.ServiceChangeMonitor;
|
||||
import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
|
||||
import nu.marginalia.service.discovery.property.ServiceEndpoint;
|
||||
import static nu.marginalia.service.discovery.property.ServiceEndpoint.*;
|
||||
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import org.apache.curator.framework.recipes.locks.InterProcessSemaphoreV2;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
import static nu.marginalia.service.discovery.property.ServiceEndpoint.InstanceAddress;
|
||||
|
||||
/** A service registry that allows services to register themselves and
|
||||
* be discovered by other services on the network.
|
||||
*/
|
||||
@ImplementedBy(ZkServiceRegistry.class)
|
||||
public interface ServiceRegistryIf {
|
||||
/**
|
||||
* Register a service with the registry.
|
||||
@@ -57,4 +61,9 @@ public interface ServiceRegistryIf {
|
||||
* </ul>
|
||||
* */
|
||||
void registerMonitor(ServiceMonitorIf monitor) throws Exception;
|
||||
|
||||
void registerProcess(String processName, int nodeId);
|
||||
void deregisterProcess(String processName, int nodeId);
|
||||
|
||||
InterProcessSemaphoreV2 getSemaphore(String name, int permits) throws Exception;
|
||||
}
|
||||
|
@@ -6,6 +6,7 @@ import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
|
||||
import nu.marginalia.service.discovery.property.ServiceEndpoint;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import org.apache.curator.framework.CuratorFramework;
|
||||
import org.apache.curator.framework.recipes.locks.InterProcessSemaphoreV2;
|
||||
import org.apache.curator.utils.ZKPaths;
|
||||
import org.apache.zookeeper.CreateMode;
|
||||
import org.apache.zookeeper.Watcher;
|
||||
@@ -256,6 +257,42 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
|
||||
.forPath("/running-instances");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void registerProcess(String processName, int nodeId) {
|
||||
String path = "/process-locks/" + processName + "/" + nodeId;
|
||||
try {
|
||||
curatorFramework.create()
|
||||
.creatingParentsIfNeeded()
|
||||
.withMode(CreateMode.EPHEMERAL)
|
||||
.forPath(path);
|
||||
livenessPaths.add(path);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Failed to register process {} on node {}", processName, nodeId, ex);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void deregisterProcess(String processName, int nodeId) {
|
||||
String path = "/process-locks/" + processName + "/" + nodeId;
|
||||
try {
|
||||
curatorFramework.delete().forPath(path);
|
||||
livenessPaths.remove(path);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Failed to deregister process {} on node {}", processName, nodeId, ex);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public InterProcessSemaphoreV2 getSemaphore(String name, int permits) {
|
||||
if (stopped)
|
||||
throw new IllegalStateException("Service registry is stopped, cannot get semaphore " + name);
|
||||
|
||||
String path = "/semaphores/" + name;
|
||||
return new InterProcessSemaphoreV2(curatorFramework, path, permits);
|
||||
}
|
||||
|
||||
/* Exposed for tests */
|
||||
public synchronized void shutDown() {
|
||||
if (stopped)
|
||||
|
@@ -19,6 +19,7 @@ dependencies {
|
||||
implementation project(':code:processes:crawling-process')
|
||||
implementation project(':code:processes:live-crawling-process')
|
||||
implementation project(':code:processes:loading-process')
|
||||
implementation project(':code:processes:ping-process')
|
||||
implementation project(':code:processes:converting-process')
|
||||
implementation project(':code:processes:index-constructor-process')
|
||||
|
||||
@@ -37,6 +38,7 @@ dependencies {
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
implementation project(':code:functions:live-capture:api')
|
||||
implementation project(':code:functions:search-query')
|
||||
implementation project(':code:functions:nsfw-domain-filter')
|
||||
implementation project(':code:execution:api')
|
||||
|
||||
implementation project(':code:processes:crawling-process:model')
|
||||
|
@@ -6,11 +6,13 @@ import java.util.Set;
|
||||
|
||||
public enum ExecutorActor {
|
||||
PREC_EXPORT_ALL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
SYNC_NSFW_LISTS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
|
||||
CRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
RECRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
RECRAWL_SINGLE_DOMAIN(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
PROC_CRAWLER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
PROC_PING_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.REALTIME),
|
||||
PROC_EXPORT_TASKS_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
ADJACENCY_CALCULATION(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
EXPORT_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
@@ -35,7 +37,8 @@ public enum ExecutorActor {
|
||||
LIVE_CRAWL(NodeProfile.REALTIME),
|
||||
PROC_LIVE_CRAWL_SPAWNER(NodeProfile.REALTIME),
|
||||
SCRAPE_FEEDS(NodeProfile.REALTIME),
|
||||
UPDATE_RSS(NodeProfile.REALTIME);
|
||||
UPDATE_RSS(NodeProfile.REALTIME)
|
||||
;
|
||||
|
||||
public String id() {
|
||||
return "fsm:" + name().toLowerCase();
|
||||
|
@@ -49,6 +49,7 @@ public class ExecutorActorControlService {
|
||||
RecrawlSingleDomainActor recrawlSingleDomainActor,
|
||||
RestoreBackupActor restoreBackupActor,
|
||||
ConverterMonitorActor converterMonitorFSM,
|
||||
PingMonitorActor pingMonitorActor,
|
||||
CrawlerMonitorActor crawlerMonitorActor,
|
||||
LiveCrawlerMonitorActor liveCrawlerMonitorActor,
|
||||
LoaderMonitorActor loaderMonitor,
|
||||
@@ -68,6 +69,7 @@ public class ExecutorActorControlService {
|
||||
ExecutorActorStateMachines stateMachines,
|
||||
MigrateCrawlDataActor migrateCrawlDataActor,
|
||||
ExportAllPrecessionActor exportAllPrecessionActor,
|
||||
UpdateNsfwFiltersActor updateNsfwFiltersActor,
|
||||
UpdateRssActor updateRssActor) throws SQLException {
|
||||
this.messageQueueFactory = messageQueueFactory;
|
||||
this.eventLog = baseServiceParams.eventLog;
|
||||
@@ -88,6 +90,7 @@ public class ExecutorActorControlService {
|
||||
register(ExecutorActor.PROC_CONVERTER_SPAWNER, converterMonitorFSM);
|
||||
register(ExecutorActor.PROC_LOADER_SPAWNER, loaderMonitor);
|
||||
register(ExecutorActor.PROC_CRAWLER_SPAWNER, crawlerMonitorActor);
|
||||
register(ExecutorActor.PROC_PING_SPAWNER, pingMonitorActor);
|
||||
register(ExecutorActor.PROC_LIVE_CRAWL_SPAWNER, liveCrawlerMonitorActor);
|
||||
register(ExecutorActor.PROC_EXPORT_TASKS_SPAWNER, exportTasksMonitorActor);
|
||||
|
||||
@@ -109,6 +112,7 @@ public class ExecutorActorControlService {
|
||||
register(ExecutorActor.UPDATE_RSS, updateRssActor);
|
||||
|
||||
register(ExecutorActor.MIGRATE_CRAWL_DATA, migrateCrawlDataActor);
|
||||
register(ExecutorActor.SYNC_NSFW_LISTS, updateNsfwFiltersActor);
|
||||
|
||||
if (serviceConfiguration.node() == 1) {
|
||||
register(ExecutorActor.PREC_EXPORT_ALL, exportAllPrecessionActor);
|
||||
|
@@ -0,0 +1,178 @@
|
||||
package nu.marginalia.actor.proc;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.actor.state.Resume;
|
||||
import nu.marginalia.actor.state.Terminal;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.persistence.MqMessageHandlerRegistry;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.mqapi.ping.PingRequest;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
/** Actor that spawns the ping process on this node and supervises it:
 *  it seeds the process inbox with a PingRequest, watches the inbox for
 *  work, (re)starts the process when there is a message to act on, and
 *  aborts cleanly when the user cancels the process.
 */
@Singleton
public class PingMonitorActor extends RecordActorPrototype {

    private final MqPersistence persistence;
    private final ProcessService processService;

    private final Logger logger = LoggerFactory.getLogger(getClass());

    /** Maximum number of consecutive failed runs before transitioning to Error. */
    public static final int MAX_ATTEMPTS = 3;

    // Message-queue inbox the ping process consumes; suffixed with this service's node id
    private final String inboxName;
    private final ProcessService.ProcessId processId;

    // Single worker thread so the process launch can be awaited from a thread that
    // remains interruptible (see TaskExecution below)
    private final ExecutorService executorService = Executors.newSingleThreadExecutor();
    private final int node;
    private final Gson gson;

    /** Entry state: seeds the inbox with a PingRequest. */
    public record Initial() implements ActorStep {}
    /** Watches the inbox; errorAttempts counts consecutive failed runs. */
    @Resume(behavior = ActorResumeBehavior.RETRY)
    public record Monitor(int errorAttempts) implements ActorStep {}
    /** Spawns and supervises one execution of the ping process. */
    @Resume(behavior = ActorResumeBehavior.RESTART)
    public record Run(int attempts) implements ActorStep {}
    /** Terminal state reached when the process is cancelled by the user. */
    @Terminal
    public record Aborted() implements ActorStep {}

    @Override
    public ActorStep transition(ActorStep self) throws Exception {
        return switch (self) {
            case Initial i -> {
                PingRequest request = new PingRequest();

                // Enqueue the request the spawned process will pick up
                persistence.sendNewMessage(inboxName, null, null,
                        "PingRequest",
                        gson.toJson(request),
                        null);

                yield new Monitor(0);
            }
            case Monitor(int errorAttempts) -> {
                for (;;) {
                    var messages = persistence.eavesdrop(inboxName, 1);

                    if (messages.isEmpty() && !processService.isRunning(processId)) {
                        // Idle: wait (bounded) for a notification on the process handle
                        // before polling the inbox again
                        synchronized (processId) {
                            processId.wait(5000);
                        }

                        if (errorAttempts > 0) { // Reset the error counter if there is silence in the inbox
                            yield new Monitor(0);
                        }
                        // else continue
                    } else {
                        // NOTE(review): if the inbox is empty while the process is still
                        // running, this branch calls getFirst() on an empty collection --
                        // verify that state is unreachable here, or guard it.

                        // Special: Associate this thread with the message so that we can get tracking
                        MqMessageHandlerRegistry.register(messages.getFirst().msgId());

                        yield new Run(0);
                    }
                }
            }
            case Run(int attempts) -> {
                try {
                    long startTime = System.currentTimeMillis();
                    var exec = new TaskExecution();
                    long endTime = System.currentTimeMillis();

                    if (exec.isError()) {
                        if (attempts < MAX_ATTEMPTS)
                            yield new Run(attempts + 1);
                        else
                            yield new Error();
                    }
                    else if (endTime - startTime < TimeUnit.SECONDS.toMillis(1)) {
                        // To avoid boot loops, we transition to error if the process
                        // didn't run for longer than 1 seconds. This might happen if
                        // the process crashes before it can reach the heartbeat and inbox
                        // stages of execution. In this case it would not report having acted
                        // on its message, and the process would be restarted forever without
                        // the attempts counter incrementing.
                        yield new Error("Process terminated within 1 seconds of starting");
                    }
                }
                catch (InterruptedException ex) {
                    // We get this exception when the process is cancelled by the user

                    processService.kill(processId);
                    setCurrentMessageToDead();

                    yield new Aborted();
                }

                // Process ran and terminated normally: go back to watching the inbox
                yield new Monitor(attempts);
            }
            default -> new Error();
        };
    }

    public String describe() {
        return "Spawns a(n) " + processId + " process and monitors its inbox for messages";
    }

    @Inject
    public PingMonitorActor(Gson gson,
                            ServiceConfiguration configuration,
                            MqPersistence persistence,
                            ProcessService processService) throws SQLException {
        super(gson);
        this.gson = gson;
        this.node = configuration.node();
        this.persistence = persistence;
        this.processService = processService;
        this.inboxName = ProcessInboxNames.PING_INBOX + ":" + node;
        this.processId = ProcessService.ProcessId.PING;
    }

    /** Sets the message to dead in the database to avoid
     * the service respawning on the same task when we
     * re-enable this actor */
    private void setCurrentMessageToDead() {
        try {
            var messages = persistence.eavesdrop(inboxName, 1);

            if (messages.isEmpty()) // Possibly a race condition where the task is already finished
                return;

            var theMessage = messages.iterator().next();
            persistence.updateMessageState(theMessage.msgId(), MqMessageState.DEAD);
        }
        catch (SQLException ex) {
            logger.error("Tried but failed to set the message for " + processId + " to dead", ex);
        }
    }

    /** Encapsulates the execution of the process in a separate thread so that
     * we can interrupt the thread if the process is cancelled */
    private class TaskExecution {
        // Set by the worker thread if triggering the process threw
        private final AtomicBoolean error = new AtomicBoolean(false);
        public TaskExecution() throws ExecutionException, InterruptedException {
            // Run this call in a separate thread so that this thread can be interrupted waiting for it
            executorService.submit(() -> {
                try {
                    processService.trigger(processId);
                } catch (Exception e) {
                    logger.warn("Error in triggering process", e);
                    error.set(true);
                }
            }).get(); // Wait for the process to start
        }

        public boolean isError() {
            return error.get();
        }
    }
}
|
@@ -0,0 +1,53 @@
|
||||
package nu.marginalia.actor.task;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.nsfw.NsfwDomainFilter;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@Singleton
|
||||
public class UpdateNsfwFiltersActor extends RecordActorPrototype {
|
||||
private final ServiceConfiguration serviceConfiguration;
|
||||
private final NsfwDomainFilter nsfwDomainFilter;
|
||||
|
||||
public record Initial() implements ActorStep {}
|
||||
public record Run() implements ActorStep {}
|
||||
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch(self) {
|
||||
case Initial() -> {
|
||||
if (serviceConfiguration.node() != 1) {
|
||||
yield new Error("This actor can only run on node 1");
|
||||
}
|
||||
else {
|
||||
yield new Run();
|
||||
}
|
||||
}
|
||||
case Run() -> {
|
||||
nsfwDomainFilter.fetchLists();
|
||||
yield new End();
|
||||
}
|
||||
default -> new Error();
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public String describe() {
|
||||
return "Sync NSFW filters";
|
||||
}
|
||||
|
||||
@Inject
|
||||
public UpdateNsfwFiltersActor(Gson gson,
|
||||
ServiceConfiguration serviceConfiguration,
|
||||
NsfwDomainFilter nsfwDomainFilter)
|
||||
{
|
||||
super(gson);
|
||||
this.serviceConfiguration = serviceConfiguration;
|
||||
this.nsfwDomainFilter = nsfwDomainFilter;
|
||||
}
|
||||
|
||||
}
|
@@ -8,6 +8,7 @@ import nu.marginalia.crawl.CrawlerMain;
|
||||
import nu.marginalia.index.IndexConstructorMain;
|
||||
import nu.marginalia.livecrawler.LiveCrawlerMain;
|
||||
import nu.marginalia.loading.LoaderMain;
|
||||
import nu.marginalia.ping.PingMain;
|
||||
import nu.marginalia.service.control.ServiceEventLog;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
import nu.marginalia.task.ExportTasksMain;
|
||||
@@ -41,6 +42,7 @@ public class ProcessService {
|
||||
return switch (id) {
|
||||
case "converter" -> ProcessId.CONVERTER;
|
||||
case "crawler" -> ProcessId.CRAWLER;
|
||||
case "ping" -> ProcessId.PING;
|
||||
case "loader" -> ProcessId.LOADER;
|
||||
case "export-tasks" -> ProcessId.EXPORT_TASKS;
|
||||
case "index-constructor" -> ProcessId.INDEX_CONSTRUCTOR;
|
||||
@@ -50,6 +52,7 @@ public class ProcessService {
|
||||
|
||||
public enum ProcessId {
|
||||
CRAWLER(CrawlerMain.class),
|
||||
PING(PingMain.class),
|
||||
LIVE_CRAWLER(LiveCrawlerMain.class),
|
||||
CONVERTER(ConverterMain.class),
|
||||
LOADER(LoaderMain.class),
|
||||
@@ -68,6 +71,7 @@ public class ProcessService {
|
||||
case LIVE_CRAWLER -> "LIVE_CRAWLER_PROCESS_OPTS";
|
||||
case CONVERTER -> "CONVERTER_PROCESS_OPTS";
|
||||
case LOADER -> "LOADER_PROCESS_OPTS";
|
||||
case PING -> "PING_PROCESS_OPTS";
|
||||
case INDEX_CONSTRUCTOR -> "INDEX_CONSTRUCTION_PROCESS_OPTS";
|
||||
case EXPORT_TASKS -> "EXPORT_TASKS_PROCESS_OPTS";
|
||||
};
|
||||
|
@@ -27,10 +27,12 @@ public class DbBrowseDomainsRandom {
|
||||
public List<BrowseResult> getRandomDomains(int count, DomainBlacklist blacklist, int set) {
|
||||
|
||||
final String q = """
|
||||
SELECT DOMAIN_ID, DOMAIN_NAME, INDEXED
|
||||
SELECT EC_RANDOM_DOMAINS.DOMAIN_ID, DOMAIN_NAME, INDEXED
|
||||
FROM EC_RANDOM_DOMAINS
|
||||
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID
|
||||
LEFT JOIN DOMAIN_AVAILABILITY_INFORMATION DAI ON DAI.DOMAIN_ID=EC_RANDOM_DOMAINS.DOMAIN_ID
|
||||
WHERE STATE<2
|
||||
AND SERVER_AVAILABLE
|
||||
AND DOMAIN_SET=?
|
||||
AND DOMAIN_ALIAS IS NULL
|
||||
ORDER BY RAND()
|
||||
|
@@ -22,12 +22,13 @@ dependencies {
|
||||
implementation project(':code:common:db')
|
||||
implementation project(':code:libraries:blocking-thread-pool')
|
||||
implementation project(':code:libraries:message-queue')
|
||||
implementation project(':code:libraries:domain-lock')
|
||||
|
||||
implementation project(':code:execution:api')
|
||||
implementation project(':code:processes:crawling-process:ft-content-type')
|
||||
implementation project(':third-party:rssreader')
|
||||
|
||||
implementation libs.jsoup
|
||||
implementation project(':third-party:rssreader')
|
||||
implementation libs.opencsv
|
||||
implementation libs.slop
|
||||
implementation libs.sqlite
|
||||
@@ -57,8 +58,6 @@ dependencies {
|
||||
implementation libs.bundles.gson
|
||||
implementation libs.bundles.mariadb
|
||||
|
||||
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
|
@@ -0,0 +1,126 @@
|
||||
package nu.marginalia.domsample;

import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import jakarta.inject.Named;
import nu.marginalia.domsample.db.DomSampleDb;
import nu.marginalia.livecapture.BrowserlessClient;
import nu.marginalia.service.module.ServiceConfiguration;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.net.URI;
import java.net.URISyntaxException;
import java.time.Duration;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.TimeUnit;

/** Background service that periodically renders the front page of each
 *  indexed domain through a browserless instance and stores the annotated
 *  DOM sample in a local sqlite database (DomSampleDb).
 */
public class DomSampleService {
    private final DomSampleDb db;
    private final HikariDataSource mariadbDataSource;
    // Null when the service is disabled (no browserless address, or node > 1)
    private final URI browserlessURI;

    private static final Logger logger = LoggerFactory.getLogger(DomSampleService.class);

    /**
     * @param browserlessAddress address of the browserless instance; when blank the service is disabled
     * @throws URISyntaxException if the configured browserless address is not a valid URI
     */
    @Inject
    public DomSampleService(DomSampleDb db,
                            HikariDataSource mariadbDataSource,
                            @Named("browserless-uri") String browserlessAddress,
                            ServiceConfiguration serviceConfiguration)
            throws URISyntaxException
    {
        this.db = db;
        this.mariadbDataSource = mariadbDataSource;

        // Disabled when no browserless endpoint is configured or on secondary nodes
        if (StringUtils.isEmpty(browserlessAddress) || serviceConfiguration.node() > 1) {
            logger.warn("Live capture service will not run");
            browserlessURI = null;
        }
        else {
            browserlessURI = new URI(browserlessAddress);
        }
    }

    /** Starts the sampling loop on a daemon thread; no-op when the service is disabled. */
    public void start() {
        if (browserlessURI == null) {
            logger.warn("DomSampleService is not enabled due to missing browserless URI or multi-node configuration");
            return;
        }

        Thread.ofPlatform().daemon().start(this::run);
    }

    /** Copies the set of crawled domains (NODE_AFFINITY&gt;0) from the mariadb
     *  EC_DOMAIN table into the sqlite sampling schedule. */
    public void syncDomains() {
        Set<String> dbDomains = new HashSet<>();

        logger.info("Fetching domains from database...");

        try (var conn = mariadbDataSource.getConnection();
             var stmt = conn.prepareStatement("""
                     SELECT DOMAIN_NAME
                     FROM EC_DOMAIN
                     WHERE NODE_AFFINITY>0
                     """)
        ) {
            var rs = stmt.executeQuery();
            while (rs.next()) {
                dbDomains.add(rs.getString("DOMAIN_NAME"));
            }
        } catch (Exception e) {
            throw new RuntimeException("Failed to sync domains", e);
        }

        logger.info("Found {} domains in database", dbDomains.size());

        db.syncDomains(dbDomains);

        logger.info("Synced domains to sqlite");
    }

    /** Main loop: every cycle, re-sync the schedule and fetch a sample for
     *  each scheduled domain.  Exits when the thread is interrupted. */
    public void run() {

        try (var client = new BrowserlessClient(browserlessURI)) {

            while (!Thread.currentThread().isInterrupted()) {

                try {
                    // Grace sleep in case we're operating on an empty domain list
                    TimeUnit.SECONDS.sleep(15);

                    syncDomains();
                    var domains = db.getScheduledDomains();

                    for (var domain : domains) {
                        updateDomain(client, domain);
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    logger.info("DomSampleService interrupted, stopping...");
                    return;
                } catch (Exception e) {
                    // Best effort: log and keep the loop alive for the next cycle
                    logger.error("Error in DomSampleService run loop", e);
                }
            }

        }
    }

    /** Fetches an annotated DOM sample of the domain's front page and stores it.
     *  The domain is flagged as fetched even on failure, so it is rescheduled
     *  behind the other domains rather than retried immediately. */
    private void updateDomain(BrowserlessClient client, String domain) {
        var rootUrl = "https://" + domain + "/";
        try {
            var content = client.annotatedContent(rootUrl, new BrowserlessClient.GotoOptions("load", Duration.ofSeconds(10).toMillis()));

            if (content.isPresent()) {
                db.saveSample(domain, rootUrl, content.get());
            }
        } catch (Exception e) {
            logger.error("Failed to process domain: " + domain, e);
        }
        finally {
            db.flagDomainAsFetched(domain);
        }
    }

}
|
@@ -0,0 +1,174 @@
|
||||
package nu.marginalia.domsample.db;

import nu.marginalia.WmsaHome;
import org.jsoup.Jsoup;

import java.nio.file.Path;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.*;

/** Sqlite-backed store for rendered DOM samples and the per-domain fetch schedule.
 * <p>
 * Two tables are kept:
 * <ul>
 *   <li>{@code samples} — one row per sampled URL (url is the primary key, so
 *       re-sampling a URL replaces the previous row)</li>
 *   <li>{@code schedule} — one row per domain with the time it was last fetched</li>
 * </ul>
 */
public class DomSampleDb implements AutoCloseable {
    private static final String dbFileName = "dom-sample.db";
    private final Connection connection;

    /** Opens (or creates) the database at the default location under the WMSA data path. */
    public DomSampleDb() throws SQLException{
        this(WmsaHome.getDataPath().resolve(dbFileName));
    }

    /** Opens (or creates) the database at the given path and ensures the schema exists. */
    public DomSampleDb(Path dbPath) throws SQLException {
        String dbUrl = "jdbc:sqlite:" + dbPath.toAbsolutePath();

        connection = DriverManager.getConnection(dbUrl);

        try (var stmt = connection.createStatement()) {
            stmt.executeUpdate("CREATE TABLE IF NOT EXISTS samples (url TEXT PRIMARY KEY, domain TEXT, sample BLOB, requests BLOB, accepted_popover BOOLEAN DEFAULT FALSE)");
            stmt.executeUpdate("CREATE INDEX IF NOT EXISTS domain_index ON samples (domain)");
            stmt.executeUpdate("CREATE TABLE IF NOT EXISTS schedule (domain TEXT PRIMARY KEY, last_fetch TIMESTAMP DEFAULT NULL)");
            // WAL mode lets readers proceed while the sampling thread writes
            stmt.execute("PRAGMA journal_mode=WAL");
        }

    }

    /** Reconciles the schedule table with the given set of domains: domains not
     * in the set are removed, new domains are added with no last_fetch time.
     *
     * @throws RuntimeException wrapping any SQLException
     */
    public void syncDomains(Set<String> domains) {
        Set<String> currentDomains = new HashSet<>();
        try (var stmt = connection.prepareStatement("SELECT domain FROM schedule")) {
            var rs = stmt.executeQuery();
            while (rs.next()) {
                currentDomains.add(rs.getString("domain"));
            }
        } catch (SQLException e) {
            throw new RuntimeException("Failed to sync domains", e);
        }

        // Compute the set differences in both directions
        Set<String> toRemove = new HashSet<>(currentDomains);
        Set<String> toAdd = new HashSet<>(domains);

        toRemove.removeAll(domains);
        toAdd.removeAll(currentDomains);

        try (var removeStmt = connection.prepareStatement("DELETE FROM schedule WHERE domain = ?");
             var addStmt = connection.prepareStatement("INSERT OR IGNORE INTO schedule (domain) VALUES (?)")
        ) {
            for (String domain : toRemove) {
                removeStmt.setString(1, domain);
                removeStmt.executeUpdate();
            }

            for (String domain : toAdd) {
                addStmt.setString(1, domain);
                addStmt.executeUpdate();
            }
        } catch (SQLException e) {
            // Fixed: the old message claimed only removal failed, but this block
            // performs both removals and insertions
            throw new RuntimeException("Failed to update domain schedule", e);
        }
    }

    /** Returns all scheduled domains, never-fetched domains first, then oldest
     * fetch time first. */
    public List<String> getScheduledDomains() {
        List<String> domains = new ArrayList<>();
        try (var stmt = connection.prepareStatement("SELECT domain FROM schedule ORDER BY last_fetch IS NULL DESC, last_fetch ASC")) {
            var rs = stmt.executeQuery();
            while (rs.next()) {
                domains.add(rs.getString("domain"));
            }
        } catch (SQLException e) {
            throw new RuntimeException("Failed to get scheduled domains", e);
        }
        return domains;
    }

    /** Records the current time as the domain's last fetch time, moving it to
     * the back of the schedule. */
    public void flagDomainAsFetched(String domain) {
        try (var stmt = connection.prepareStatement("INSERT OR REPLACE INTO schedule (domain, last_fetch) VALUES (?, CURRENT_TIMESTAMP)")) {
            stmt.setString(1, domain);
            stmt.executeUpdate();
        } catch (SQLException e) {
            throw new RuntimeException("Failed to flag domain as fetched", e);
        }
    }


    /** One stored sample: the cleaned page HTML plus a TSV of the network
     * requests observed while rendering it. */
    public record Sample(String url, String domain, String sample, String requests, boolean acceptedPopover) {}

    /** Loads all stored samples for the given domain. */
    public List<Sample> getSamples(String domain) throws SQLException {
        List<Sample> samples = new ArrayList<>();

        try (var stmt = connection.prepareStatement("""
                SELECT url, sample, requests, accepted_popover
                FROM samples
                WHERE domain = ?
                """))
        {
            stmt.setString(1, domain);
            var rs = stmt.executeQuery();
            while (rs.next()) {
                samples.add(
                        new Sample(
                                rs.getString("url"),
                                domain,
                                rs.getString("sample"),
                                rs.getString("requests"),
                                rs.getBoolean("accepted_popover")
                        )
                );
            }
        }
        return samples;
    }

    /** Parses a raw annotated page capture, extracts the network-request log
     * injected by the capture extension (element id {@code marginalia-network-requests})
     * into TSV form ({@code method\ttimestamp\turl} per line), strips the log
     * from the document, and stores the result.
     */
    public void saveSample(String domain, String url, String rawContent) throws SQLException {
        var doc = Jsoup.parse(rawContent);

        var networkRequests = doc.getElementById("marginalia-network-requests");

        boolean acceptedPopover = false;

        StringBuilder requestTsv = new StringBuilder();
        if (networkRequests != null) {

            // The extension tags the log with this class when it clicked through a cookie popover
            acceptedPopover = !networkRequests.getElementsByClass("marginalia-agreed-cookies").isEmpty();

            for (var request : networkRequests.getElementsByClass("network-request")) {
                String method = request.attr("data-method");
                String urlAttr = request.attr("data-url");
                String timestamp = request.attr("data-timestamp");

                requestTsv
                        .append(method)
                        .append('\t')
                        .append(timestamp)
                        .append('\t')
                        .append(urlAttr.replace('\n', ' ')) // keep one request per TSV line
                        .append("\n");
            }

            // Remove the injected log so the stored sample is just the page
            networkRequests.remove();
        }

        doc.body().removeAttr("id");

        String sample = doc.html();

        saveSampleRaw(domain, url, sample, requestTsv.toString().trim(), acceptedPopover);

    }

    /** Stores a sample verbatim; replaces any previous sample for the same url. */
    public void saveSampleRaw(String domain, String url, String sample, String requests, boolean acceptedPopover) throws SQLException {
        try (var stmt = connection.prepareStatement("""
                INSERT OR REPLACE
                INTO samples (domain, url, sample, requests, accepted_popover)
                VALUES (?, ?, ?, ?, ?)
                """)) {
            stmt.setString(1, domain);
            stmt.setString(2, url);
            stmt.setString(3, sample);
            stmt.setString(4, requests);
            stmt.setBoolean(5, acceptedPopover);
            stmt.executeUpdate();
        }
    }

    public void close() throws SQLException {
        connection.close();
    }
}
|
@@ -8,10 +8,13 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.net.URLEncoder;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.Duration;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
@@ -60,6 +63,42 @@ public class BrowserlessClient implements AutoCloseable {
|
||||
return Optional.of(rsp.body());
|
||||
}
|
||||
|
||||
/** Fetches content with a marginalia hack extension loaded that decorates the DOM with attributes for
|
||||
* certain CSS attributes, to be able to easier identify popovers and other nuisance elements.
|
||||
*/
|
||||
public Optional<String> annotatedContent(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
|
||||
Map<String, Object> requestData = Map.of(
|
||||
"url", url,
|
||||
"userAgent", userAgent,
|
||||
"gotoOptions", gotoOptions,
|
||||
"waitForSelector", Map.of("selector", "#marginaliahack", "timeout", 15000)
|
||||
);
|
||||
|
||||
// Launch parameters for the browserless instance to load the extension
|
||||
Map<String, Object> launchParameters = Map.of(
|
||||
"args", List.of("--load-extension=/dom-export")
|
||||
);
|
||||
|
||||
String launchParametersStr = URLEncoder.encode(gson.toJson(launchParameters), StandardCharsets.UTF_8);
|
||||
|
||||
var request = HttpRequest.newBuilder()
|
||||
.uri(browserlessURI.resolve("/content?token="+BROWSERLESS_TOKEN+"&launch="+launchParametersStr))
|
||||
.method("POST", HttpRequest.BodyPublishers.ofString(
|
||||
gson.toJson(requestData)
|
||||
))
|
||||
.header("Content-type", "application/json")
|
||||
.build();
|
||||
|
||||
var rsp = httpClient.send(request, HttpResponse.BodyHandlers.ofString());
|
||||
|
||||
if (rsp.statusCode() >= 300) {
|
||||
logger.info("Failed to fetch annotated content for {}, status {}", url, rsp.statusCode());
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
return Optional.of(rsp.body());
|
||||
}
|
||||
|
||||
public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions)
|
||||
throws IOException, InterruptedException {
|
||||
|
||||
|
@@ -126,7 +126,6 @@ public class LiveCaptureGrpcService
|
||||
}
|
||||
else {
|
||||
EdgeDomain domain = domainNameOpt.get();
|
||||
String domainNameStr = domain.toString();
|
||||
|
||||
if (!isValidDomainForCapture(domain)) {
|
||||
ScreenshotDbOperations.flagDomainAsFetched(conn, domain);
|
||||
|
@@ -1,66 +0,0 @@
|
||||
package nu.marginalia.rss.svc;

import nu.marginalia.model.EdgeDomain;

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore;

/** Holds lock objects for each domain, to prevent multiple threads from
 * crawling the same domain at the same time.
 */
public class DomainLocks {
    // The locks are stored in a map, with the domain name as the key. This map will grow
    // relatively big, but should be manageable since the number of domains is limited to
    // a few hundred thousand typically.
    private final Map<String, Semaphore> locks = new ConcurrentHashMap<>();

    /** Returns a lock object corresponding to the given domain. The object is returned as-is,
     * and may be held by another thread. The caller is responsible for locking and releasing the lock.
     */
    public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
        // Note the semaphore is keyed on the *top* domain (lowercased), so all
        // subdomains of e.g. wordpress.com share the same concurrency budget
        return new DomainLock(domain.toString(),
                locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits));
    }

    /** Chooses the per-top-domain concurrency budget: large multi-tenant hosts
     * get more permits, slow or sensitive hosts get fewer. */
    private Semaphore defaultPermits(String topDomain) {
        if (topDomain.equals("wordpress.com"))
            return new Semaphore(16);
        if (topDomain.equals("blogspot.com"))
            return new Semaphore(8);

        if (topDomain.equals("neocities.org"))
            return new Semaphore(4);
        if (topDomain.equals("github.io"))
            return new Semaphore(4);

        if (topDomain.equals("substack.com")) {
            return new Semaphore(1);
        }
        if (topDomain.endsWith(".edu")) {
            return new Semaphore(1);
        }

        // Default budget for all other top domains
        return new Semaphore(2);
    }

    /** RAII-style holder: acquires a permit in the constructor and releases it
     * on close.  Intended for use in try-with-resources. */
    public static class DomainLock implements AutoCloseable {
        private final String domainName;
        private final Semaphore semaphore;

        DomainLock(String domainName, Semaphore semaphore) throws InterruptedException {
            this.domainName = domainName;
            this.semaphore = semaphore;

            // Thread renaming appears to be for diagnostics: makes lock waits
            // visible in thread dumps
            Thread.currentThread().setName("fetching:" + domainName + " [await domain lock]");
            semaphore.acquire();
            Thread.currentThread().setName("fetching:" + domainName);
        }

        @Override
        public void close() {
            semaphore.release();
            Thread.currentThread().setName("fetching:" + domainName + " [wrapping up]");
        }
    }
}
|
@@ -5,6 +5,8 @@ import com.opencsv.CSVReader;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.contenttype.ContentType;
|
||||
import nu.marginalia.contenttype.DocumentBodyToString;
|
||||
import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.coordination.DomainLock;
|
||||
import nu.marginalia.executor.client.ExecutorClient;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.nodecfg.NodeConfigurationService;
|
||||
@@ -51,12 +53,13 @@ public class FeedFetcherService {
|
||||
private final ServiceHeartbeat serviceHeartbeat;
|
||||
private final ExecutorClient executorClient;
|
||||
|
||||
private final DomainLocks domainLocks = new DomainLocks();
|
||||
private final DomainCoordinator domainCoordinator;
|
||||
|
||||
private volatile boolean updating;
|
||||
|
||||
@Inject
|
||||
public FeedFetcherService(FeedDb feedDb,
|
||||
DomainCoordinator domainCoordinator,
|
||||
FileStorageService fileStorageService,
|
||||
NodeConfigurationService nodeConfigurationService,
|
||||
ServiceHeartbeat serviceHeartbeat,
|
||||
@@ -67,6 +70,7 @@ public class FeedFetcherService {
|
||||
this.nodeConfigurationService = nodeConfigurationService;
|
||||
this.serviceHeartbeat = serviceHeartbeat;
|
||||
this.executorClient = executorClient;
|
||||
this.domainCoordinator = domainCoordinator;
|
||||
}
|
||||
|
||||
public enum UpdateMode {
|
||||
@@ -132,7 +136,7 @@ public class FeedFetcherService {
|
||||
};
|
||||
|
||||
FetchResult feedData;
|
||||
try (DomainLocks.DomainLock domainLock = domainLocks.lockDomain(new EdgeDomain(feed.domain()))) {
|
||||
try (DomainLock domainLock = domainCoordinator.lockDomain(new EdgeDomain(feed.domain()))) {
|
||||
feedData = fetchFeedData(feed, client, fetchExecutor, ifModifiedSinceDate, ifNoneMatchTag);
|
||||
} catch (Exception ex) {
|
||||
feedData = new FetchResult.TransientError();
|
||||
|
@@ -0,0 +1,113 @@
|
||||
package nu.marginalia.domsample.db;

import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.testcontainers.shaded.org.apache.commons.io.FileUtils;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;

import static org.junit.jupiter.api.Assertions.*;

/** Unit tests for DomSampleDb, exercising schedule sync/ordering and
 * sample save/load against a throwaway sqlite file.
 *
 * NOTE(review): the catch-and-fail(e.getMessage()) pattern below loses the
 * stack trace; letting the test methods declare {@code throws Exception}
 * would give better failure diagnostics.
 */
class DomSampleDbTest {
    // Fresh temp dir per test; each test creates its own db file inside it
    Path tempDir;

    @BeforeEach
    void setUp() throws Exception {
        tempDir = Files.createTempDirectory("test");
    }

    @AfterEach
    void tearDown() throws IOException {
        FileUtils.deleteDirectory(tempDir.toFile());
    }

    /** Smoke test: constructing the db creates the schema without error. */
    @Test
    public void testSetUp() {
        var dbPath = tempDir.resolve("test.db");
        try (var db = new DomSampleDb(dbPath)) {
        }
        catch (Exception e) {
            fail("Failed to set up database: " + e.getMessage());
        }
    }

    /** Repeated syncDomains calls should make the schedule mirror the given set
     * (adding new domains, dropping absent ones). */
    @Test
    public void testSyncDomains() {
        var dbPath = tempDir.resolve("test.db");
        try (var db = new DomSampleDb(dbPath)) {

            db.syncDomains(Set.of("example.com", "test.com", "foobar.com"));
            assertEquals(Set.of("example.com", "test.com", "foobar.com"), new HashSet<>(db.getScheduledDomains()));
            db.syncDomains(Set.of("example.com", "test.com"));
            assertEquals(Set.of("example.com", "test.com"), new HashSet<>(db.getScheduledDomains()));
            db.syncDomains(Set.of("foobar.com", "test.com"));
            assertEquals(Set.of("foobar.com", "test.com"), new HashSet<>(db.getScheduledDomains()));
        }
        catch (Exception e) {
            fail("Failed to sync domains: " + e.getMessage());
        }
    }

    /** Flagging a domain as fetched should move it to the back of the schedule
     * (getScheduledDomains orders by last_fetch, oldest first). */
    @Test
    public void testFetchDomains() {
        var dbPath = tempDir.resolve("test.db");
        try (var db = new DomSampleDb(dbPath)) {

            db.syncDomains(Set.of("example.com", "test.com", "foobar.com"));
            db.flagDomainAsFetched("example.com");
            db.flagDomainAsFetched("test.com");
            db.flagDomainAsFetched("foobar.com");
            assertEquals(List.of("example.com", "test.com", "foobar.com"), db.getScheduledDomains());
            // Re-flagging test.com moves it behind the others
            db.flagDomainAsFetched("test.com");
            assertEquals(List.of("example.com", "foobar.com", "test.com"), db.getScheduledDomains());
        }
        catch (Exception e) {
            fail("Failed to sync domains: " + e.getMessage());
        }
    }

    /** A raw sample round-trips through saveSampleRaw/getSamples unchanged. */
    @Test
    public void saveLoadSingle() {
        var dbPath = tempDir.resolve("test.db");
        try (var db = new DomSampleDb(dbPath)) {
            db.saveSampleRaw("example.com", "http://example.com/sample", "sample data", "requests data", true);
            var samples = db.getSamples("example.com");
            assertEquals(1, samples.size());
            var sample = samples.getFirst();
            assertEquals("example.com", sample.domain());
            assertEquals("http://example.com/sample", sample.url());
            assertEquals("sample data", sample.sample());
            assertEquals("requests data", sample.requests());
            assertTrue(sample.acceptedPopover());
        }
        catch (Exception e) {
            fail("Failed to save/load sample: " + e.getMessage());
        }
    }

    /** Two samples with distinct URLs on the same domain are both stored and
     * retrievable. */
    @Test
    public void saveLoadTwo() {
        var dbPath = tempDir.resolve("test.db");
        try (var db = new DomSampleDb(dbPath)) {
            db.saveSampleRaw("example.com", "http://example.com/sample", "sample data", "r1", true);
            db.saveSampleRaw("example.com", "http://example.com/sample2", "sample data2", "r2", false);
            var samples = db.getSamples("example.com");
            assertEquals(2, samples.size());

            // getSamples has no ordering guarantee, so index by URL before asserting
            Map<String, String> samplesByUrl = new HashMap<>();
            for (var sample : samples) {
                samplesByUrl.put(sample.url(), sample.sample());
            }

            assertEquals("sample data", samplesByUrl.get("http://example.com/sample"));
            assertEquals("sample data2", samplesByUrl.get("http://example.com/sample2"));
        }
        catch (Exception e) {
            fail("Failed to save/load sample: " + e.getMessage());
        }
    }
}
|
@@ -3,17 +3,21 @@ package nu.marginalia.livecapture;
|
||||
import com.github.tomakehurst.wiremock.WireMockServer;
|
||||
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.domsample.db.DomSampleDb;
|
||||
import nu.marginalia.service.module.ServiceConfigurationModule;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.testcontainers.containers.GenericContainer;
|
||||
import org.testcontainers.images.PullPolicy;
|
||||
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||
import org.testcontainers.utility.DockerImageName;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Map;
|
||||
|
||||
import static com.github.tomakehurst.wiremock.client.WireMock.*;
|
||||
@@ -22,9 +26,14 @@ import static com.github.tomakehurst.wiremock.client.WireMock.*;
|
||||
@Testcontainers
|
||||
@Tag("slow")
|
||||
public class BrowserlessClientTest {
|
||||
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
|
||||
// Run gradle docker if this image is not available
|
||||
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("marginalia-browserless"))
|
||||
.withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
|
||||
.withImagePullPolicy(PullPolicy.defaultPolicy())
|
||||
.withNetworkMode("bridge")
|
||||
.withLogConsumer(frame -> {
|
||||
System.out.print(frame.getUtf8String());
|
||||
})
|
||||
.withExposedPorts(3000);
|
||||
|
||||
static WireMockServer wireMockServer =
|
||||
@@ -34,6 +43,7 @@ public class BrowserlessClientTest {
|
||||
static String localIp;
|
||||
|
||||
static URI browserlessURI;
|
||||
static URI browserlessWssURI;
|
||||
|
||||
@BeforeAll
|
||||
public static void setup() throws IOException {
|
||||
@@ -44,6 +54,12 @@ public class BrowserlessClientTest {
|
||||
container.getMappedPort(3000))
|
||||
);
|
||||
|
||||
browserlessWssURI = URI.create(String.format("ws://%s:%d/?token=BROWSERLESS_TOKEN",
|
||||
container.getHost(),
|
||||
container.getMappedPort(3000))
|
||||
);
|
||||
|
||||
|
||||
wireMockServer.start();
|
||||
wireMockServer.stubFor(get("/").willReturn(aResponse().withStatus(200).withBody("Ok")));
|
||||
|
||||
@@ -85,6 +101,30 @@ public class BrowserlessClientTest {
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAnnotatedContent() throws Exception {
|
||||
|
||||
try (var client = new BrowserlessClient(browserlessURI);
|
||||
DomSampleDb dbop = new DomSampleDb(Path.of("/tmp/dom-sample.db"))
|
||||
) {
|
||||
var content = client.annotatedContent("https://marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()).orElseThrow();
|
||||
dbop.saveSample("marginalia.nu", "https://marginalia.nu/", content);
|
||||
System.out.println(content);
|
||||
Assertions.assertFalse(content.isBlank(), "Content should not be empty");
|
||||
|
||||
dbop.getSamples("marginalia.nu").forEach(sample -> {
|
||||
System.out.println("Sample URL: " + sample.url());
|
||||
System.out.println("Sample Content: " + sample.sample());
|
||||
System.out.println("Sample Requests: " + sample.requests());
|
||||
System.out.println("Accepted Popover: " + sample.acceptedPopover());
|
||||
});
|
||||
}
|
||||
finally {
|
||||
Files.deleteIfExists(Path.of("/tmp/dom-sample.db"));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testScreenshot() throws Exception {
|
||||
try (var client = new BrowserlessClient(browserlessURI)) {
|
||||
|
43
code/functions/nsfw-domain-filter/build.gradle
Normal file
43
code/functions/nsfw-domain-filter/build.gradle
Normal file
@@ -0,0 +1,43 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:db')
|
||||
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.prometheus
|
||||
implementation libs.guava
|
||||
implementation libs.commons.lang3
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
implementation libs.notnull
|
||||
implementation libs.fastutil
|
||||
implementation libs.bundles.mariadb
|
||||
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
|
||||
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
|
||||
testImplementation libs.commons.codec
|
||||
testImplementation project(':code:common:service')
|
||||
testImplementation 'org.testcontainers:mariadb:1.17.4'
|
||||
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
|
||||
testImplementation project(':code:libraries:test-helpers')
|
||||
}
|
@@ -0,0 +1,192 @@
|
||||
package nu.marginalia.nsfw;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.google.inject.name.Named;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
@Singleton
|
||||
public class NsfwDomainFilter {
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
private final List<String> dangerLists;
|
||||
private final List<String> smutLists;
|
||||
|
||||
private volatile IntOpenHashSet blockedDomainIdsTier1 = new IntOpenHashSet();
|
||||
private volatile IntOpenHashSet blockedDomainIdsTier2 = new IntOpenHashSet();
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(NsfwDomainFilter.class);
|
||||
|
||||
public static final int NSFW_DISABLE = 0;
|
||||
public static final int NSFW_BLOCK_DANGER = 1;
|
||||
public static final int NSFW_BLOCK_SMUT = 2;
|
||||
|
||||
@Inject
|
||||
public NsfwDomainFilter(HikariDataSource dataSource,
|
||||
@Named("nsfw.dangerLists") List<String> dangerLists,
|
||||
@Named("nsfw.smutLists") List<String> smutLists
|
||||
) {
|
||||
this.dataSource = dataSource;
|
||||
|
||||
this.dangerLists = dangerLists;
|
||||
this.smutLists = smutLists;
|
||||
|
||||
Thread.ofPlatform().daemon().name("NsfwDomainFilterSync").start(() -> {
|
||||
while (true) {
|
||||
sync();
|
||||
try {
|
||||
TimeUnit.HOURS.sleep(1);
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
break; // Exit the loop if interrupted
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
public boolean isBlocked(int domainId, int tier) {
|
||||
if (tier == 0)
|
||||
return false;
|
||||
|
||||
if (tier >= 1 && blockedDomainIdsTier1.contains(domainId))
|
||||
return true;
|
||||
if (tier >= 2 && blockedDomainIdsTier2.contains(domainId))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private synchronized void sync() {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("SELECT ID, TIER FROM NSFW_DOMAINS")
|
||||
) {
|
||||
var rs = stmt.executeQuery();
|
||||
IntOpenHashSet tier1 = new IntOpenHashSet();
|
||||
IntOpenHashSet tier2 = new IntOpenHashSet();
|
||||
|
||||
while (rs.next()) {
|
||||
int domainId = rs.getInt("ID");
|
||||
int tier = rs.getInt("TIER");
|
||||
|
||||
switch (tier) {
|
||||
case 1 -> tier1.add(domainId);
|
||||
case 2 -> tier2.add(domainId);
|
||||
}
|
||||
}
|
||||
|
||||
this.blockedDomainIdsTier1 = tier1;
|
||||
this.blockedDomainIdsTier2 = tier2;
|
||||
|
||||
logger.info("NSFW domain filter synced: {} tier 1, {} tier 2", tier1.size(), tier2.size());
|
||||
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to sync NSFW domain filter", ex);
|
||||
}
|
||||
}
|
||||
|
||||
public synchronized void fetchLists() {
|
||||
try (var conn = dataSource.getConnection();
|
||||
HttpClient client = HttpClient.newBuilder()
|
||||
.followRedirects(HttpClient.Redirect.ALWAYS)
|
||||
.build();
|
||||
var stmt = conn.createStatement();
|
||||
var insertStmt = conn.prepareStatement("INSERT IGNORE INTO NSFW_DOMAINS_TMP (ID, TIER) SELECT ID, ? FROM EC_DOMAIN WHERE DOMAIN_NAME = ?")) {
|
||||
|
||||
stmt.execute("DROP TABLE IF EXISTS NSFW_DOMAINS_TMP");
|
||||
stmt.execute("CREATE TABLE NSFW_DOMAINS_TMP LIKE NSFW_DOMAINS");
|
||||
|
||||
List<String> combinedDangerList = new ArrayList<>(10_000);
|
||||
for (var dangerListUrl : dangerLists) {
|
||||
combinedDangerList.addAll(fetchList(client, dangerListUrl));
|
||||
}
|
||||
|
||||
for (String domain : combinedDangerList) {
|
||||
insertStmt.setInt(1, NSFW_BLOCK_DANGER);
|
||||
insertStmt.setString(2, domain);
|
||||
insertStmt.execute();
|
||||
}
|
||||
|
||||
List<String> combinedSmutList = new ArrayList<>(10_000);
|
||||
for (var smutListUrl : smutLists) {
|
||||
combinedSmutList.addAll(fetchList(client, smutListUrl));
|
||||
}
|
||||
|
||||
for (String domain : combinedSmutList) {
|
||||
insertStmt.setInt(1, NSFW_BLOCK_SMUT);
|
||||
insertStmt.setString(2, domain);
|
||||
insertStmt.addBatch();
|
||||
insertStmt.execute();
|
||||
}
|
||||
|
||||
stmt.execute("""
|
||||
DROP TABLE IF EXISTS NSFW_DOMAINS
|
||||
""");
|
||||
stmt.execute("""
|
||||
RENAME TABLE NSFW_DOMAINS_TMP TO NSFW_DOMAINS
|
||||
""");
|
||||
sync();
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to fetch NSFW domain lists", ex);
|
||||
}
|
||||
}
|
||||
|
||||
public List<String> fetchList(HttpClient client, String url) {
|
||||
|
||||
logger.info("Fetching NSFW domain list from {}", url);
|
||||
|
||||
var request = HttpRequest.newBuilder()
|
||||
.uri(java.net.URI.create(url))
|
||||
.build();
|
||||
|
||||
try {
|
||||
if (url.endsWith(".gz")) {
|
||||
var response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
|
||||
|
||||
byte[] body = response.body();
|
||||
|
||||
try (var reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(new ByteArrayInputStream(body))))) {
|
||||
return reader.lines()
|
||||
.filter(StringUtils::isNotEmpty)
|
||||
.toList();
|
||||
} catch (Exception e) {
|
||||
logger.error("Error reading GZIP response from {}", url, e);
|
||||
}
|
||||
} else {
|
||||
var response = client.send(request, HttpResponse.BodyHandlers.ofString());
|
||||
if (response.statusCode() == 200) {
|
||||
|
||||
return Arrays.stream(StringUtils.split(response.body(), "\n"))
|
||||
.filter(StringUtils::isNotEmpty)
|
||||
.toList();
|
||||
} else {
|
||||
logger.warn("Failed to fetch list from {}: HTTP {}", url, response.statusCode());
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Error fetching NSFW domain list from {}", url, e);
|
||||
}
|
||||
|
||||
|
||||
return List.of();
|
||||
}
|
||||
}
|
@@ -0,0 +1,30 @@
|
||||
package nu.marginalia.nsfw;
|
||||
|
||||
import com.google.inject.AbstractModule;
|
||||
import com.google.inject.Provides;
|
||||
import jakarta.inject.Named;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class NsfwFilterModule extends AbstractModule {
|
||||
|
||||
@Provides
|
||||
@Named("nsfw.dangerLists")
|
||||
public List<String> nsfwDomainLists1() {
|
||||
return List.of(
|
||||
"https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/cryptojacking/domains",
|
||||
"https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/malware/domains",
|
||||
"https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/phishing/domains"
|
||||
);
|
||||
}
|
||||
@Provides
|
||||
@Named("nsfw.smutLists")
|
||||
public List<String> nsfwDomainLists2() {
|
||||
return List.of(
|
||||
"https://github.com/olbat/ut1-blacklists/raw/refs/heads/master/blacklists/adult/domains.gz",
|
||||
"https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/gambling/domains"
|
||||
);
|
||||
}
|
||||
|
||||
public void configure() {}
|
||||
}
|
@@ -0,0 +1,108 @@
|
||||
package nu.marginalia.nsfw;
|
||||
|
||||
|
||||
import com.google.inject.AbstractModule;
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Provides;
|
||||
import com.zaxxer.hikari.HikariConfig;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import jakarta.inject.Named;
|
||||
import nu.marginalia.test.TestMigrationLoader;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.testcontainers.containers.MariaDBContainer;
|
||||
import org.testcontainers.junit.jupiter.Container;
|
||||
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
|
||||
@Tag("slow")
|
||||
@Testcontainers
|
||||
class NsfwDomainFilterTest extends AbstractModule {
|
||||
|
||||
@Container
|
||||
static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
|
||||
.withDatabaseName("WMSA_prod")
|
||||
.withUsername("wmsa")
|
||||
.withPassword("wmsa")
|
||||
.withNetworkAliases("mariadb");
|
||||
|
||||
static HikariDataSource dataSource;
|
||||
static Path tempDir;
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpDb() throws IOException {
|
||||
tempDir = Files.createTempDirectory(NsfwDomainFilterTest.class.getSimpleName());
|
||||
|
||||
System.setProperty("system.homePath", tempDir.toString());
|
||||
|
||||
HikariConfig config = new HikariConfig();
|
||||
config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
|
||||
config.setUsername("wmsa");
|
||||
config.setPassword("wmsa");
|
||||
|
||||
dataSource = new HikariDataSource(config);
|
||||
|
||||
TestMigrationLoader.flywayMigration(dataSource);
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES (?, ?, 1)")
|
||||
) {
|
||||
|
||||
// Ensure the database is ready
|
||||
conn.createStatement().execute("SELECT 1");
|
||||
|
||||
stmt.setString(1, "www.google.com");
|
||||
stmt.setString(2, "google.com");
|
||||
stmt.executeUpdate();
|
||||
stmt.setString(1, "www.bing.com");
|
||||
stmt.setString(2, "bing.com");
|
||||
stmt.executeUpdate();
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Failed to connect to the database", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Provides
|
||||
@Named("nsfw.dangerLists")
|
||||
public List<String> nsfwDomainLists1() {
|
||||
return List.of(
|
||||
"https://downloads.marginalia.nu/test/list1"
|
||||
);
|
||||
}
|
||||
|
||||
@Provides
|
||||
@Named("nsfw.smutLists")
|
||||
public List<String> nsfwDomainLists2() {
|
||||
return List.of(
|
||||
"https://downloads.marginalia.nu/test/list2.gz"
|
||||
);
|
||||
}
|
||||
|
||||
public void configure() {
|
||||
bind(HikariDataSource.class).toInstance(dataSource);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test() {
|
||||
var filter = Guice
|
||||
.createInjector(this)
|
||||
.getInstance(NsfwDomainFilter.class);
|
||||
|
||||
filter.fetchLists();
|
||||
|
||||
assertTrue(filter.isBlocked(1, NsfwDomainFilter.NSFW_BLOCK_DANGER));
|
||||
assertTrue(filter.isBlocked(1, NsfwDomainFilter.NSFW_BLOCK_SMUT));
|
||||
assertFalse(filter.isBlocked(2, NsfwDomainFilter.NSFW_BLOCK_DANGER));
|
||||
assertTrue(filter.isBlocked(2, NsfwDomainFilter.NSFW_BLOCK_SMUT));
|
||||
}
|
||||
|
||||
}
|
@@ -1,9 +1,6 @@
|
||||
package nu.marginalia.api.searchquery;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryResponse;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||
import nu.marginalia.api.searchquery.model.query.*;
|
||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||
@@ -32,6 +29,8 @@ public class QueryProtobufCodec {
|
||||
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
|
||||
builder.setHumanQuery(request.getHumanQuery());
|
||||
|
||||
builder.setNsfwFilterTierValue(request.getNsfwFilterTierValue());
|
||||
|
||||
builder.setQuality(IndexProtobufCodec.convertSpecLimit(query.specs.quality));
|
||||
builder.setYear(IndexProtobufCodec.convertSpecLimit(query.specs.year));
|
||||
builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size));
|
||||
@@ -78,6 +77,8 @@ public class QueryProtobufCodec {
|
||||
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
|
||||
builder.setHumanQuery(humanQuery);
|
||||
|
||||
builder.setNsfwFilterTier(RpcIndexQuery.NSFW_FILTER_TIER.DANGER);
|
||||
|
||||
builder.setQuality(IndexProtobufCodec.convertSpecLimit(query.specs.quality));
|
||||
builder.setYear(IndexProtobufCodec.convertSpecLimit(query.specs.year));
|
||||
builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size));
|
||||
@@ -112,6 +113,7 @@ public class QueryProtobufCodec {
|
||||
request.getSearchSetIdentifier(),
|
||||
QueryStrategy.valueOf(request.getQueryStrategy()),
|
||||
RpcTemporalBias.Bias.valueOf(request.getTemporalBias().getBias().name()),
|
||||
NsfwFilterTier.fromCodedValue(request.getNsfwFilterTierValue()),
|
||||
request.getPagination().getPage()
|
||||
);
|
||||
}
|
||||
@@ -327,6 +329,7 @@ public class QueryProtobufCodec {
|
||||
.setRank(IndexProtobufCodec.convertSpecLimit(params.rank()))
|
||||
.setSearchSetIdentifier(params.identifier())
|
||||
.setQueryStrategy(params.queryStrategy().name())
|
||||
.setNsfwFilterTierValue(params.filterTier().getCodedValue())
|
||||
.setTemporalBias(RpcTemporalBias.newBuilder()
|
||||
.setBias(RpcTemporalBias.Bias.valueOf(params.temporalBias().name()))
|
||||
.build())
|
||||
|
@@ -0,0 +1,26 @@
|
||||
package nu.marginalia.api.searchquery.model.query;
|
||||
|
||||
public enum NsfwFilterTier {
|
||||
OFF(0),
|
||||
DANGER(1),
|
||||
PORN_AND_GAMBLING(2);
|
||||
|
||||
private final int codedValue; // same as ordinal() for now, but can be changed later if needed
|
||||
|
||||
NsfwFilterTier(int codedValue) {
|
||||
this.codedValue = codedValue;
|
||||
}
|
||||
|
||||
public static NsfwFilterTier fromCodedValue(int codedValue) {
|
||||
for (NsfwFilterTier tier : NsfwFilterTier.values()) {
|
||||
if (tier.codedValue == codedValue) {
|
||||
return tier;
|
||||
}
|
||||
}
|
||||
throw new IllegalArgumentException("Invalid coded value for NsfwFilterTirer: " + codedValue);
|
||||
}
|
||||
|
||||
public int getCodedValue() {
|
||||
return codedValue;
|
||||
}
|
||||
}
|
@@ -25,10 +25,11 @@ public record QueryParams(
|
||||
String identifier,
|
||||
QueryStrategy queryStrategy,
|
||||
RpcTemporalBias.Bias temporalBias,
|
||||
NsfwFilterTier filterTier,
|
||||
int page
|
||||
)
|
||||
{
|
||||
public QueryParams(String query, RpcQueryLimits limits, String identifier) {
|
||||
public QueryParams(String query, RpcQueryLimits limits, String identifier, NsfwFilterTier filterTier) {
|
||||
this(query, null,
|
||||
List.of(),
|
||||
List.of(),
|
||||
@@ -43,6 +44,7 @@ public record QueryParams(
|
||||
identifier,
|
||||
QueryStrategy.AUTO,
|
||||
RpcTemporalBias.Bias.NONE,
|
||||
filterTier,
|
||||
1 // page
|
||||
);
|
||||
}
|
||||
|
@@ -32,6 +32,14 @@ message RpcQsQuery {
|
||||
RpcTemporalBias temporalBias = 16;
|
||||
|
||||
RpcQsQueryPagination pagination = 17;
|
||||
|
||||
NSFW_FILTER_TIER nsfwFilterTier = 18;
|
||||
|
||||
enum NSFW_FILTER_TIER {
|
||||
NONE = 0;
|
||||
DANGER = 1;
|
||||
PORN_AND_GAMBLING = 2;
|
||||
};
|
||||
}
|
||||
|
||||
/* Query service query response */
|
||||
@@ -78,8 +86,17 @@ message RpcIndexQuery {
|
||||
RpcQueryLimits queryLimits = 10;
|
||||
string queryStrategy = 11; // Named query configuration
|
||||
RpcResultRankingParameters parameters = 12;
|
||||
|
||||
NSFW_FILTER_TIER nsfwFilterTier = 13;
|
||||
|
||||
enum NSFW_FILTER_TIER {
|
||||
NONE = 0;
|
||||
DANGER = 1;
|
||||
PORN_AND_GAMBLING = 2;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
/* A tagged union encoding some limit on a field */
|
||||
message RpcSpecLimit {
|
||||
int32 value = 1;
|
||||
|
@@ -19,6 +19,7 @@ dependencies {
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:service')
|
||||
|
||||
implementation project(':code:functions:nsfw-domain-filter')
|
||||
implementation project(':code:functions:search-query:api')
|
||||
|
||||
implementation project(':code:index:query')
|
||||
|
@@ -11,6 +11,7 @@ import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
|
||||
import nu.marginalia.index.api.IndexClient;
|
||||
import nu.marginalia.nsfw.NsfwDomainFilter;
|
||||
import nu.marginalia.service.server.DiscoverableService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@@ -34,13 +35,16 @@ public class QueryGRPCService
|
||||
|
||||
|
||||
private final QueryFactory queryFactory;
|
||||
private final NsfwDomainFilter nsfwDomainFilter;
|
||||
private final IndexClient indexClient;
|
||||
|
||||
@Inject
|
||||
public QueryGRPCService(QueryFactory queryFactory,
|
||||
NsfwDomainFilter nsfwDomainFilter,
|
||||
IndexClient indexClient)
|
||||
{
|
||||
this.queryFactory = queryFactory;
|
||||
this.nsfwDomainFilter = nsfwDomainFilter;
|
||||
this.indexClient = indexClient;
|
||||
}
|
||||
|
||||
|
@@ -3,6 +3,7 @@ package nu.marginalia.query.svc;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.RpcTemporalBias;
|
||||
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||
import nu.marginalia.functions.searchquery.QueryFactory;
|
||||
@@ -58,6 +59,7 @@ public class QueryFactoryTest {
|
||||
"NONE",
|
||||
QueryStrategy.AUTO,
|
||||
RpcTemporalBias.Bias.NONE,
|
||||
NsfwFilterTier.OFF,
|
||||
0), null).specs;
|
||||
}
|
||||
|
||||
|
@@ -17,6 +17,7 @@ dependencies {
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:common:db')
|
||||
implementation project(':code:libraries:message-queue')
|
||||
implementation project(':code:functions:nsfw-domain-filter')
|
||||
implementation project(':code:functions:search-query:api')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
|
@@ -2,11 +2,13 @@ package nu.marginalia.index.api;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import io.prometheus.client.Counter;
|
||||
import nu.marginalia.api.searchquery.IndexApiGrpc;
|
||||
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
|
||||
import nu.marginalia.api.searchquery.RpcIndexQuery;
|
||||
import nu.marginalia.db.DomainBlacklistImpl;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import nu.marginalia.nsfw.NsfwDomainFilter;
|
||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
@@ -28,14 +30,26 @@ public class IndexClient {
|
||||
private static final Logger logger = LoggerFactory.getLogger(IndexClient.class);
|
||||
private final GrpcMultiNodeChannelPool<IndexApiGrpc.IndexApiBlockingStub> channelPool;
|
||||
private final DomainBlacklistImpl blacklist;
|
||||
private final NsfwDomainFilter nsfwDomainFilter;
|
||||
|
||||
Counter wmsa_index_query_count = Counter.build()
|
||||
.name("wmsa_nsfw_filter_result_count")
|
||||
.labelNames("tier")
|
||||
.help("Count of results filtered by NSFW tier")
|
||||
.register();
|
||||
|
||||
private static final ExecutorService executor = Executors.newCachedThreadPool();
|
||||
|
||||
@Inject
|
||||
public IndexClient(GrpcChannelPoolFactory channelPoolFactory, DomainBlacklistImpl blacklist) {
|
||||
public IndexClient(GrpcChannelPoolFactory channelPoolFactory,
|
||||
DomainBlacklistImpl blacklist,
|
||||
NsfwDomainFilter nsfwDomainFilter
|
||||
) {
|
||||
this.channelPool = channelPoolFactory.createMulti(
|
||||
ServiceKey.forGrpcApi(IndexApiGrpc.class, ServicePartition.multi()),
|
||||
IndexApiGrpc::newBlockingStub);
|
||||
this.blacklist = blacklist;
|
||||
this.nsfwDomainFilter = nsfwDomainFilter;
|
||||
}
|
||||
|
||||
private static final Comparator<RpcDecoratedResultItem> comparator =
|
||||
@@ -52,7 +66,7 @@ public class IndexClient {
|
||||
public AggregateQueryResponse executeQueries(RpcIndexQuery indexRequest, Pagination pagination) {
|
||||
|
||||
final int requestedMaxResults = indexRequest.getQueryLimits().getResultsTotal();
|
||||
|
||||
int filterTier = indexRequest.getNsfwFilterTierValue();
|
||||
AtomicInteger totalNumResults = new AtomicInteger(0);
|
||||
|
||||
List<RpcDecoratedResultItem> results =
|
||||
@@ -74,7 +88,7 @@ public class IndexClient {
|
||||
}
|
||||
})
|
||||
.flatMap(List::stream)
|
||||
.filter(item -> !isBlacklisted(item))
|
||||
.filter(item -> !isBlacklisted(item, filterTier))
|
||||
.sorted(comparator)
|
||||
.skip(Math.max(0, (pagination.page - 1) * pagination.pageSize))
|
||||
.limit(pagination.pageSize)
|
||||
@@ -83,8 +97,23 @@ public class IndexClient {
|
||||
return new AggregateQueryResponse(results, pagination.page(), totalNumResults.get());
|
||||
}
|
||||
|
||||
private boolean isBlacklisted(RpcDecoratedResultItem item) {
|
||||
return blacklist.isBlacklisted(UrlIdCodec.getDomainId(item.getRawItem().getCombinedId()));
|
||||
static String[] tierNames = {
|
||||
"OFF",
|
||||
"DANGER",
|
||||
"NSFW"
|
||||
};
|
||||
|
||||
private boolean isBlacklisted(RpcDecoratedResultItem item, int filterTier) {
|
||||
int domainId = UrlIdCodec.getDomainId(item.getRawItem().getCombinedId());
|
||||
|
||||
if (blacklist.isBlacklisted(domainId)) {
|
||||
return true;
|
||||
}
|
||||
if (nsfwDomainFilter.isBlocked(domainId, filterTier)) {
|
||||
wmsa_index_query_count.labels(tierNames[filterTier]).inc();
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -84,7 +84,7 @@ public class ForwardIndexConverter {
|
||||
|
||||
LongArray docFileData = LongArrayFactory.mmapForWritingConfined(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size());
|
||||
|
||||
ByteBuffer workArea = ByteBuffer.allocate(65536);
|
||||
ByteBuffer workArea = ByteBuffer.allocate(1024*1024*100);
|
||||
for (var instance : journal.pages()) {
|
||||
try (var slopTable = new SlopTable(instance.baseDir(), instance.page()))
|
||||
{
|
||||
|
32
code/libraries/domain-lock/build.gradle
Normal file
32
code/libraries/domain-lock/build.gradle
Normal file
@@ -0,0 +1,32 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation libs.bundles.slf4j
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
||||
|
||||
implementation libs.bundles.curator
|
||||
|
||||
implementation libs.guava
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
}
|
||||
|
||||
test {
|
||||
useJUnitPlatform()
|
||||
}
|
@@ -0,0 +1,32 @@
|
||||
package nu.marginalia.coordination;
|
||||
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
|
||||
public class DefaultDomainPermits {
|
||||
|
||||
public static int defaultPermits(EdgeDomain domain) {
|
||||
return defaultPermits(domain.topDomain.toLowerCase());
|
||||
}
|
||||
|
||||
public static int defaultPermits(String topDomain) {
|
||||
|
||||
if (topDomain.equals("wordpress.com"))
|
||||
return 16;
|
||||
if (topDomain.equals("blogspot.com"))
|
||||
return 8;
|
||||
if (topDomain.equals("tumblr.com"))
|
||||
return 8;
|
||||
if (topDomain.equals("neocities.org"))
|
||||
return 8;
|
||||
if (topDomain.equals("github.io"))
|
||||
return 8;
|
||||
// Substack really dislikes broad-scale crawlers, so we need to be careful
|
||||
// to not get blocked.
|
||||
if (topDomain.equals("substack.com")) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 2;
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,17 @@
|
||||
package nu.marginalia.coordination;
|
||||
|
||||
import com.google.inject.AbstractModule;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
public class DomainCoordinationModule extends AbstractModule {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(DomainCoordinationModule.class);
|
||||
|
||||
public DomainCoordinationModule() {
|
||||
}
|
||||
|
||||
public void configure() {
|
||||
bind(DomainCoordinator.class).to(ZookeeperDomainCoordinator.class);
|
||||
}
|
||||
}
|
@@ -0,0 +1,13 @@
|
||||
package nu.marginalia.coordination;
|
||||
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.Optional;
|
||||
|
||||
public interface DomainCoordinator {
|
||||
DomainLock lockDomain(EdgeDomain domain) throws InterruptedException;
|
||||
Optional<DomainLock> tryLockDomain(EdgeDomain domain, Duration timeout) throws InterruptedException;
|
||||
Optional<DomainLock> tryLockDomain(EdgeDomain domain) throws InterruptedException;
|
||||
boolean isLockableHint(EdgeDomain domain);
|
||||
}
|
@@ -0,0 +1,5 @@
|
||||
package nu.marginalia.coordination;
|
||||
|
||||
public interface DomainLock extends AutoCloseable {
|
||||
void close();
|
||||
}
|
@@ -1,16 +1,17 @@
|
||||
package nu.marginalia.crawl.logic;
|
||||
package nu.marginalia.coordination;
|
||||
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.Semaphore;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/** Holds lock objects for each domain, to prevent multiple threads from
|
||||
* crawling the same domain at the same time.
|
||||
*/
|
||||
public class DomainLocks {
|
||||
@Singleton
|
||||
public class LocalDomainCoordinator implements DomainCoordinator {
|
||||
// The locks are stored in a map, with the domain name as the key. This map will grow
|
||||
// relatively big, but should be manageable since the number of domains is limited to
|
||||
// a few hundred thousand typically.
|
||||
@@ -24,13 +25,25 @@ public class DomainLocks {
|
||||
|
||||
sem.acquire();
|
||||
|
||||
return new DomainLock(sem);
|
||||
return new LocalDomainLock(sem);
|
||||
}
|
||||
|
||||
public Optional<DomainLock> tryLockDomain(EdgeDomain domain) {
|
||||
var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
|
||||
if (sem.tryAcquire(1)) {
|
||||
return Optional.of(new DomainLock(sem));
|
||||
return Optional.of(new LocalDomainLock(sem));
|
||||
}
|
||||
else {
|
||||
// We don't have a lock, so we return an empty optional
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public Optional<DomainLock> tryLockDomain(EdgeDomain domain, Duration timeout) throws InterruptedException {
|
||||
var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
|
||||
if (sem.tryAcquire(1, timeout.toMillis(), TimeUnit.MILLISECONDS)) {
|
||||
return Optional.of(new LocalDomainLock(sem));
|
||||
}
|
||||
else {
|
||||
// We don't have a lock, so we return an empty optional
|
||||
@@ -39,24 +52,7 @@ public class DomainLocks {
|
||||
}
|
||||
|
||||
private Semaphore defaultPermits(String topDomain) {
|
||||
if (topDomain.equals("wordpress.com"))
|
||||
return new Semaphore(16);
|
||||
if (topDomain.equals("blogspot.com"))
|
||||
return new Semaphore(8);
|
||||
if (topDomain.equals("tumblr.com"))
|
||||
return new Semaphore(8);
|
||||
if (topDomain.equals("neocities.org"))
|
||||
return new Semaphore(8);
|
||||
if (topDomain.equals("github.io"))
|
||||
return new Semaphore(8);
|
||||
|
||||
// Substack really dislikes broad-scale crawlers, so we need to be careful
|
||||
// to not get blocked.
|
||||
if (topDomain.equals("substack.com")) {
|
||||
return new Semaphore(1);
|
||||
}
|
||||
|
||||
return new Semaphore(2);
|
||||
return new Semaphore(DefaultDomainPermits.defaultPermits(topDomain));
|
||||
}
|
||||
|
||||
/** Returns true if the domain is lockable, i.e. if it is not already locked by another thread.
|
||||
@@ -71,15 +67,15 @@ public class DomainLocks {
|
||||
return sem.availablePermits() > 0;
|
||||
}
|
||||
|
||||
public static class DomainLock implements AutoCloseable {
|
||||
public static class LocalDomainLock implements DomainLock {
|
||||
private final Semaphore semaphore;
|
||||
|
||||
DomainLock(Semaphore semaphore) {
|
||||
LocalDomainLock(Semaphore semaphore) {
|
||||
this.semaphore = semaphore;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws Exception {
|
||||
public void close() {
|
||||
semaphore.release();
|
||||
Thread.currentThread().setName("[idle]");
|
||||
}
|
@@ -0,0 +1,116 @@
|
||||
package nu.marginalia.coordination;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import org.apache.curator.framework.recipes.locks.InterProcessSemaphoreV2;
|
||||
import org.apache.curator.framework.recipes.locks.Lease;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
@Singleton
|
||||
public class ZookeeperDomainCoordinator implements DomainCoordinator {
|
||||
// The locks are stored in a map, with the domain name as the key. This map will grow
|
||||
// relatively big, but should be manageable since the number of domains is limited to
|
||||
// a few hundred thousand typically.
|
||||
private final Map<String, InterProcessSemaphoreV2> locks = new ConcurrentHashMap<>();
|
||||
private final Map<String, Integer> waitCounts = new ConcurrentHashMap<>();
|
||||
|
||||
private final ServiceRegistryIf serviceRegistry;
|
||||
private final int nodeId;
|
||||
|
||||
@Inject
|
||||
public ZookeeperDomainCoordinator(ServiceRegistryIf serviceRegistry, @Named("wmsa-system-node") int nodeId) {
|
||||
// Zookeeper-specific initialization can be done here if needed
|
||||
this.serviceRegistry = serviceRegistry;
|
||||
this.nodeId = nodeId;
|
||||
}
|
||||
|
||||
/** Returns a lock object corresponding to the given domain. The object is returned as-is,
|
||||
* and may be held by another thread. The caller is responsible for locking and releasing the lock.
|
||||
*/
|
||||
public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
|
||||
final String key = domain.topDomain.toLowerCase();
|
||||
var sem = locks.computeIfAbsent(key, this::createSemapore);
|
||||
|
||||
// Increment or add a wait count for the domain
|
||||
waitCounts.compute(key, (k,value) -> (value == null ? 1 : value + 1));
|
||||
try {
|
||||
return new ZkDomainLock(sem, sem.acquire());
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException("Failed to acquire lock for domain: " + domain.topDomain, e);
|
||||
}
|
||||
finally {
|
||||
// Decrement or remove the wait count for the domain
|
||||
waitCounts.compute(key, (k,value) -> (value == null || value <= 1) ? null : value - 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public Optional<DomainLock> tryLockDomain(EdgeDomain domain) throws InterruptedException {
|
||||
return tryLockDomain(domain, Duration.ofSeconds(1)); // Underlying semaphore doesn't have a tryLock method, so we use a short timeout
|
||||
}
|
||||
|
||||
|
||||
public Optional<DomainLock> tryLockDomain(EdgeDomain domain, Duration timeout) throws InterruptedException {
|
||||
final String key = domain.topDomain.toLowerCase();
|
||||
var sem = locks.computeIfAbsent(key, this::createSemapore);
|
||||
|
||||
// Increment or add a wait count for the domain
|
||||
waitCounts.compute(key, (k,value) -> (value == null ? 1 : value + 1));
|
||||
try {
|
||||
var lease = sem.acquire(timeout.toMillis(), TimeUnit.MILLISECONDS); // Acquire with timeout
|
||||
if (lease != null) {
|
||||
return Optional.of(new ZkDomainLock(sem, lease));
|
||||
}
|
||||
else {
|
||||
return Optional.empty(); // If we fail to acquire the lease, we return an empty optional
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
return Optional.empty(); // If we fail to acquire the lock, we return an empty optional
|
||||
}
|
||||
finally {
|
||||
waitCounts.compute(key, (k,value) -> (value == null || value <= 1) ? null : value - 1);
|
||||
}
|
||||
}
|
||||
|
||||
private InterProcessSemaphoreV2 createSemapore(String topDomain){
|
||||
try {
|
||||
return serviceRegistry.getSemaphore(topDomain + ":" + nodeId, DefaultDomainPermits.defaultPermits(topDomain));
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException("Failed to get semaphore for domain: " + topDomain, e);
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns true if the domain is lockable, i.e. if it is not already locked by another thread.
|
||||
* (this is just a hint, and does not guarantee that the domain is actually lockable any time
|
||||
* after this method returns true)
|
||||
*/
|
||||
public boolean isLockableHint(EdgeDomain domain) {
|
||||
return !waitCounts.containsKey(domain.topDomain.toLowerCase());
|
||||
}
|
||||
|
||||
public static class ZkDomainLock implements DomainLock {
|
||||
private final InterProcessSemaphoreV2 semaphore;
|
||||
private final Lease lease;
|
||||
|
||||
ZkDomainLock(InterProcessSemaphoreV2 semaphore, Lease lease) {
|
||||
this.semaphore = semaphore;
|
||||
this.lease = lease;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
semaphore.returnLease(lease);
|
||||
}
|
||||
}
|
||||
}
|
@@ -15,6 +15,10 @@ dependencies {
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.opencsv
|
||||
implementation libs.guava
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
|
@@ -1,5 +1,6 @@
|
||||
package nu.marginalia.geoip;
|
||||
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.geoip.sources.AsnMapping;
|
||||
import nu.marginalia.geoip.sources.AsnTable;
|
||||
@@ -10,6 +11,7 @@ import org.slf4j.LoggerFactory;
|
||||
import java.net.InetAddress;
|
||||
import java.util.Optional;
|
||||
|
||||
@Singleton
|
||||
public class GeoIpDictionary {
|
||||
private volatile IP2LocationMapping ip2locMapping = null;
|
||||
private volatile AsnTable asnTable = null;
|
||||
@@ -76,7 +78,7 @@ public class GeoIpDictionary {
|
||||
}
|
||||
|
||||
public Optional<AsnTable.AsnInfo> getAsnInfo(int ipAddress) {
|
||||
if (null == asnTable) { // not loaded yet or failed to load
|
||||
if (null == asnMapping || null == asnTable) { // not loaded yet or failed to load
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
|
@@ -53,6 +53,7 @@ public class SideloaderProcessing {
|
||||
"",
|
||||
body.getBytes(StandardCharsets.UTF_8),
|
||||
false,
|
||||
-1,
|
||||
null,
|
||||
null
|
||||
);
|
||||
|
@@ -2002,12 +2002,11 @@ public class HeadingAwarePDFTextStripper extends LegacyPDFStreamEngine
|
||||
float minFontWeight = Integer.MAX_VALUE;
|
||||
for (var word : line)
|
||||
{
|
||||
int i = 0;
|
||||
for (var textPosition : word.getTextPositions())
|
||||
{
|
||||
if (word.text.charAt(i++) == ' ') {
|
||||
continue;
|
||||
}
|
||||
// Skip empty text positions as they may have a different font
|
||||
if (word.text.isBlank()) continue;
|
||||
|
||||
var font = textPosition.getFont();
|
||||
if (font == null) continue;
|
||||
var descriptor = font.getFontDescriptor();
|
||||
|
@@ -148,6 +148,7 @@ public class ConvertingIntegrationTest {
|
||||
"",
|
||||
readClassPathFile(p.toString()).getBytes(),
|
||||
false,
|
||||
-1,
|
||||
null,
|
||||
null
|
||||
);
|
||||
|
@@ -50,7 +50,7 @@ class PdfDocumentProcessorPluginTest {
|
||||
));
|
||||
}
|
||||
public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(byte[] pdfBytes) throws Exception {
|
||||
var doc = new CrawledDocument("test", "https://www.example.com/sample.pdf", "application/pdf", Instant.now().toString(), 200, "OK", "OK", "", pdfBytes, false, null, null);
|
||||
var doc = new CrawledDocument("test", "https://www.example.com/sample.pdf", "application/pdf", Instant.now().toString(), 200, "OK", "OK", "", pdfBytes, false, -1, null, null);
|
||||
return plugin.createDetails(doc, new LinkTexts(), DocumentClass.NORMAL);
|
||||
}
|
||||
|
||||
|
@@ -32,6 +32,7 @@ dependencies {
|
||||
implementation project(':code:libraries:message-queue')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
implementation project(':code:libraries:easy-lsh')
|
||||
implementation project(':code:libraries:domain-lock')
|
||||
implementation project(':code:processes:crawling-process:model')
|
||||
implementation project(':code:processes:crawling-process:model')
|
||||
|
||||
@@ -58,6 +59,7 @@ dependencies {
|
||||
implementation libs.jsoup
|
||||
implementation libs.opencsv
|
||||
implementation libs.fastutil
|
||||
implementation libs.bundles.curator
|
||||
|
||||
implementation libs.bundles.mariadb
|
||||
implementation libs.bundles.httpcomponents
|
||||
|
@@ -10,9 +10,11 @@ import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.atags.source.AnchorTagsSource;
|
||||
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
|
||||
import nu.marginalia.coordination.DomainCoordinationModule;
|
||||
import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.coordination.DomainLock;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.logic.DomainLocks;
|
||||
import nu.marginalia.crawl.retreival.CrawlDataReference;
|
||||
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
|
||||
import nu.marginalia.crawl.retreival.DomainProber;
|
||||
@@ -25,9 +27,12 @@ import nu.marginalia.mq.MessageQueueFactory;
|
||||
import nu.marginalia.process.ProcessConfiguration;
|
||||
import nu.marginalia.process.ProcessConfigurationModule;
|
||||
import nu.marginalia.process.ProcessMainClass;
|
||||
import nu.marginalia.process.control.ProcessEventLog;
|
||||
import nu.marginalia.process.control.ProcessHeartbeatImpl;
|
||||
import nu.marginalia.process.log.WorkLog;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.module.DatabaseModule;
|
||||
import nu.marginalia.service.module.ServiceDiscoveryModule;
|
||||
import nu.marginalia.slop.SlopCrawlDataRecord;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
@@ -54,6 +59,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
|
||||
private final UserAgent userAgent;
|
||||
private final ProcessHeartbeatImpl heartbeat;
|
||||
private final ProcessEventLog eventLog;
|
||||
private final DomainProber domainProber;
|
||||
private final FileStorageService fileStorageService;
|
||||
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
|
||||
@@ -61,9 +67,10 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
private final HikariDataSource dataSource;
|
||||
private final DomainBlacklist blacklist;
|
||||
private final int node;
|
||||
private final ServiceRegistryIf serviceRegistry;
|
||||
private final SimpleBlockingThreadPool pool;
|
||||
|
||||
private final DomainLocks domainLocks = new DomainLocks();
|
||||
private final DomainCoordinator domainCoordinator;
|
||||
|
||||
private final Map<String, CrawlTask> pendingCrawlTasks = new ConcurrentHashMap<>();
|
||||
|
||||
@@ -84,6 +91,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
public CrawlerMain(UserAgent userAgent,
|
||||
HttpFetcherImpl httpFetcher,
|
||||
ProcessHeartbeatImpl heartbeat,
|
||||
ProcessEventLog eventLog,
|
||||
MessageQueueFactory messageQueueFactory, DomainProber domainProber,
|
||||
FileStorageService fileStorageService,
|
||||
ProcessConfiguration processConfiguration,
|
||||
@@ -91,6 +99,8 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
WarcArchiverFactory warcArchiverFactory,
|
||||
HikariDataSource dataSource,
|
||||
DomainBlacklist blacklist,
|
||||
DomainCoordinator domainCoordinator,
|
||||
ServiceRegistryIf serviceRegistry,
|
||||
Gson gson) throws InterruptedException {
|
||||
|
||||
super(messageQueueFactory, processConfiguration, gson, CRAWLER_INBOX);
|
||||
@@ -98,6 +108,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
this.userAgent = userAgent;
|
||||
this.fetcher = httpFetcher;
|
||||
this.heartbeat = heartbeat;
|
||||
this.eventLog = eventLog;
|
||||
this.domainProber = domainProber;
|
||||
this.fileStorageService = fileStorageService;
|
||||
this.anchorTagsSourceFactory = anchorTagsSourceFactory;
|
||||
@@ -105,6 +116,8 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
this.dataSource = dataSource;
|
||||
this.blacklist = blacklist;
|
||||
this.node = processConfiguration.node();
|
||||
this.serviceRegistry = serviceRegistry;
|
||||
this.domainCoordinator = domainCoordinator;
|
||||
|
||||
SimpleBlockingThreadPool.ThreadType threadType;
|
||||
if (Boolean.getBoolean("crawler.useVirtualThreads")) {
|
||||
@@ -147,12 +160,18 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
Injector injector = Guice.createInjector(
|
||||
new CrawlerModule(),
|
||||
new ProcessConfigurationModule("crawler"),
|
||||
new ServiceDiscoveryModule(),
|
||||
new DomainCoordinationModule(),
|
||||
new DatabaseModule(false)
|
||||
);
|
||||
var crawler = injector.getInstance(CrawlerMain.class);
|
||||
|
||||
var instructions = crawler.fetchInstructions(nu.marginalia.mqapi.crawling.CrawlRequest.class);
|
||||
|
||||
crawler.serviceRegistry.registerProcess("crawler", crawler.node);
|
||||
|
||||
try {
|
||||
crawler.eventLog.logEvent("CRAWLER-INFO", "Crawling started");
|
||||
var req = instructions.value();
|
||||
if (req.targetDomainName != null) {
|
||||
crawler.runForSingleDomain(req.targetDomainName, req.crawlStorage);
|
||||
@@ -160,11 +179,15 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
else {
|
||||
crawler.runForDatabaseDomains(req.crawlStorage);
|
||||
}
|
||||
crawler.eventLog.logEvent("CRAWLER-INFO", "Crawl completed successfully");
|
||||
instructions.ok();
|
||||
} catch (Exception ex) {
|
||||
logger.error("Crawler failed", ex);
|
||||
instructions.err();
|
||||
}
|
||||
finally {
|
||||
crawler.serviceRegistry.deregisterProcess("crawler", crawler.node);
|
||||
}
|
||||
|
||||
TimeUnit.SECONDS.sleep(5);
|
||||
}
|
||||
@@ -433,7 +456,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
/** Best effort indicator whether we could start this now without getting stuck in
|
||||
* DomainLocks purgatory */
|
||||
public boolean canRun() {
|
||||
return domainLocks.isLockableHint(new EdgeDomain(domain));
|
||||
return domainCoordinator.isLockableHint(new EdgeDomain(domain));
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -444,7 +467,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
return;
|
||||
}
|
||||
|
||||
Optional<DomainLocks.DomainLock> lock = domainLocks.tryLockDomain(new EdgeDomain(domain));
|
||||
Optional<DomainLock> lock = domainCoordinator.tryLockDomain(new EdgeDomain(domain));
|
||||
// We don't have a lock, so we can't run this task
|
||||
// we return to avoid blocking the pool for too long
|
||||
if (lock.isEmpty()) {
|
||||
@@ -452,7 +475,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
retryQueue.put(this);
|
||||
return;
|
||||
}
|
||||
DomainLocks.DomainLock domainLock = lock.get();
|
||||
DomainLock domainLock = lock.get();
|
||||
|
||||
try (domainLock) {
|
||||
Thread.currentThread().setName("crawling:" + domain);
|
||||
|
@@ -36,6 +36,7 @@ import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
||||
import org.apache.hc.core5.http.message.MessageSupport;
|
||||
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||
import org.apache.hc.core5.pool.PoolStats;
|
||||
import org.apache.hc.core5.ssl.SSLContextBuilder;
|
||||
import org.apache.hc.core5.util.TimeValue;
|
||||
import org.apache.hc.core5.util.Timeout;
|
||||
import org.jsoup.Jsoup;
|
||||
@@ -48,11 +49,15 @@ import org.slf4j.MarkerFactory;
|
||||
|
||||
import javax.net.ssl.SSLContext;
|
||||
import javax.net.ssl.SSLException;
|
||||
import javax.net.ssl.TrustManager;
|
||||
import javax.net.ssl.X509TrustManager;
|
||||
import java.io.IOException;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.UnknownHostException;
|
||||
import java.security.KeyManagementException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.security.cert.X509Certificate;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.*;
|
||||
@@ -87,18 +92,49 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
||||
return connectionManager.getTotalStats();
|
||||
}
|
||||
|
||||
private CloseableHttpClient createClient() throws NoSuchAlgorithmException {
|
||||
private CloseableHttpClient createClient() throws NoSuchAlgorithmException, KeyManagementException {
|
||||
final ConnectionConfig connectionConfig = ConnectionConfig.custom()
|
||||
.setSocketTimeout(10, TimeUnit.SECONDS)
|
||||
.setConnectTimeout(30, TimeUnit.SECONDS)
|
||||
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
|
||||
.build();
|
||||
|
||||
// No-op up front validation of server certificates.
|
||||
//
|
||||
// We will validate certificates later, after the connection is established
|
||||
// as we want to store the certificate chain and validation
|
||||
// outcome to the database.
|
||||
|
||||
var trustMeBro = new X509TrustManager() {
|
||||
private X509Certificate[] lastServerCertChain;
|
||||
|
||||
@Override
|
||||
public void checkClientTrusted(X509Certificate[] chain, String authType) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void checkServerTrusted(X509Certificate[] chain, String authType) {
|
||||
this.lastServerCertChain = chain.clone();
|
||||
}
|
||||
|
||||
@Override
|
||||
public X509Certificate[] getAcceptedIssuers() {
|
||||
return new X509Certificate[0];
|
||||
}
|
||||
|
||||
public X509Certificate[] getLastServerCertChain() {
|
||||
return lastServerCertChain != null ? lastServerCertChain.clone() : null;
|
||||
}
|
||||
};
|
||||
|
||||
SSLContext sslContext = SSLContextBuilder.create().build();
|
||||
sslContext.init(null, new TrustManager[]{trustMeBro}, null);
|
||||
|
||||
connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
|
||||
.setMaxConnPerRoute(2)
|
||||
.setMaxConnTotal(5000)
|
||||
.setDefaultConnectionConfig(connectionConfig)
|
||||
.setTlsSocketStrategy(new DefaultClientTlsStrategy(SSLContext.getDefault()))
|
||||
.setTlsSocketStrategy(new DefaultClientTlsStrategy(sslContext))
|
||||
.build();
|
||||
|
||||
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
|
||||
@@ -183,6 +219,8 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
||||
this.client = createClient();
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
throw new RuntimeException(e);
|
||||
} catch (KeyManagementException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
this.userAgentString = userAgent.uaString();
|
||||
this.userAgentIdentifier = userAgent.uaIdentifier();
|
||||
@@ -193,6 +231,8 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
||||
this.client = createClient();
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
throw new RuntimeException(e);
|
||||
} catch (KeyManagementException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
this.userAgentString = userAgent;
|
||||
this.userAgentIdentifier = userAgent;
|
||||
|
@@ -10,6 +10,7 @@ import java.net.http.HttpClient;
|
||||
import java.net.http.HttpHeaders;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.Duration;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@@ -90,8 +91,8 @@ public class WarcProtocolReconstructor {
|
||||
return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
|
||||
}
|
||||
|
||||
static String getResponseHeader(ClassicHttpResponse response, long size) {
|
||||
String headerString = getHeadersAsString(response.getHeaders(), size);
|
||||
static String getResponseHeader(ClassicHttpResponse response, Duration responseDuration, long size) {
|
||||
String headerString = getHeadersAsString(response.getHeaders(), responseDuration, size);
|
||||
|
||||
return response.getVersion().format() + " " + response.getCode() + " " + response.getReasonPhrase() + "\r\n" + headerString + "\r\n\r\n";
|
||||
}
|
||||
@@ -160,7 +161,7 @@ public class WarcProtocolReconstructor {
|
||||
|
||||
|
||||
|
||||
static private String getHeadersAsString(Header[] headers, long responseSize) {
|
||||
static private String getHeadersAsString(Header[] headers, Duration responseDuration, long responseSize) {
|
||||
StringJoiner joiner = new StringJoiner("\r\n");
|
||||
|
||||
for (var header : headers) {
|
||||
@@ -176,6 +177,7 @@ public class WarcProtocolReconstructor {
|
||||
if (headerCapitalized.equals("Content-Encoding"))
|
||||
continue;
|
||||
|
||||
|
||||
// Since we're transparently decoding gzip, we need to update the Content-Length header
|
||||
// to reflect the actual size of the response body. We'll do this at the end.
|
||||
if (headerCapitalized.equals("Content-Length"))
|
||||
@@ -184,6 +186,7 @@ public class WarcProtocolReconstructor {
|
||||
joiner.add(headerCapitalized + ": " + header.getValue());
|
||||
}
|
||||
|
||||
joiner.add("X-Marginalia-Response-Time: " + responseDuration.toMillis());
|
||||
joiner.add("Content-Length: " + responseSize);
|
||||
|
||||
return joiner.toString();
|
||||
|
@@ -93,7 +93,7 @@ public class WarcRecorder implements AutoCloseable {
|
||||
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
|
||||
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
|
||||
|
||||
Instant date = Instant.now();
|
||||
Instant requestDate = Instant.now();
|
||||
|
||||
// Not entirely sure why we need to do this, but keeping it due to Chesterton's Fence
|
||||
Map<String, List<String>> extraHeaders = new HashMap<>(request.getHeaders().length);
|
||||
@@ -108,6 +108,8 @@ public class WarcRecorder implements AutoCloseable {
|
||||
try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request, timeout);
|
||||
InputStream inputStream = inputBuffer.read()) {
|
||||
|
||||
Instant responseDate = Instant.now();
|
||||
|
||||
cookies.updateCookieStore(response);
|
||||
|
||||
// Build and write the request
|
||||
@@ -126,7 +128,7 @@ public class WarcRecorder implements AutoCloseable {
|
||||
|
||||
WarcRequest warcRequest = new WarcRequest.Builder(requestUri)
|
||||
.blockDigest(requestDigestBuilder.build())
|
||||
.date(date)
|
||||
.date(requestDate)
|
||||
.body(MediaType.HTTP_REQUEST, httpRequestString)
|
||||
.build();
|
||||
|
||||
@@ -138,7 +140,9 @@ public class WarcRecorder implements AutoCloseable {
|
||||
response.addHeader("X-Has-Cookies", 1);
|
||||
}
|
||||
|
||||
byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
|
||||
byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response,
|
||||
Duration.between(requestDate, responseDate),
|
||||
inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
|
||||
|
||||
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);
|
||||
|
||||
@@ -169,7 +173,7 @@ public class WarcRecorder implements AutoCloseable {
|
||||
|
||||
WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
|
||||
.blockDigest(responseDigestBuilder.build())
|
||||
.date(date)
|
||||
.date(responseDate)
|
||||
.concurrentTo(warcRequest.id())
|
||||
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
|
||||
|
||||
@@ -184,7 +188,7 @@ public class WarcRecorder implements AutoCloseable {
|
||||
warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
|
||||
writer.write(warcResponse);
|
||||
|
||||
if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
|
||||
if (Duration.between(requestDate, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
|
||||
&& inputBuffer.size() < 2048
|
||||
&& !requestUri.getPath().endsWith("robots.txt")) // don't bail on robots.txt
|
||||
{
|
||||
@@ -196,7 +200,7 @@ public class WarcRecorder implements AutoCloseable {
|
||||
|
||||
logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)",
|
||||
requestUri,
|
||||
Duration.between(date, Instant.now()).getSeconds(),
|
||||
Duration.between(requestDate, Instant.now()).getSeconds(),
|
||||
inputBuffer.size()
|
||||
);
|
||||
|
||||
|
@@ -148,6 +148,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
|
||||
nextRecord.body,
|
||||
// this field isn't actually used, maybe we can skip calculating it?
|
||||
nextRecord.cookies,
|
||||
-1,
|
||||
lastModified,
|
||||
etag));
|
||||
}
|
||||
|
@@ -166,6 +166,7 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
|
||||
nextRecord.body(),
|
||||
// this field isn't actually used, maybe we can skip calculating it?
|
||||
nextRecord.cookies(),
|
||||
nextRecord.requestTimeMs(),
|
||||
null,
|
||||
null));
|
||||
}
|
||||
|
@@ -23,6 +23,7 @@ public final class CrawledDocument implements SerializableCrawlData {
|
||||
|
||||
public String crawlerStatus;
|
||||
public String crawlerStatusDesc;
|
||||
public int requestTimeMs;
|
||||
|
||||
@Nullable
|
||||
public String headers;
|
||||
@@ -82,7 +83,7 @@ public final class CrawledDocument implements SerializableCrawlData {
|
||||
public String lastModifiedMaybe;
|
||||
public String etagMaybe;
|
||||
|
||||
public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, byte[] documentBodyBytes, Boolean hasCookies, String lastModifiedMaybe, String etagMaybe) {
|
||||
public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, byte[] documentBodyBytes, Boolean hasCookies, int requestTimeMs, String lastModifiedMaybe, String etagMaybe) {
|
||||
this.crawlId = crawlId;
|
||||
this.url = url;
|
||||
this.contentType = contentType;
|
||||
@@ -94,6 +95,7 @@ public final class CrawledDocument implements SerializableCrawlData {
|
||||
this.documentBodyBytes = Objects.requireNonNullElse(documentBodyBytes, new byte[] {});
|
||||
this.hasCookies = hasCookies;
|
||||
this.lastModifiedMaybe = lastModifiedMaybe;
|
||||
this.requestTimeMs = requestTimeMs;
|
||||
this.etagMaybe = etagMaybe;
|
||||
}
|
||||
|
||||
@@ -173,6 +175,7 @@ public final class CrawledDocument implements SerializableCrawlData {
|
||||
private byte[] documentBodyBytes = new byte[0];
|
||||
private String recrawlState;
|
||||
private Boolean hasCookies;
|
||||
private int requestTimeMs;
|
||||
private String lastModifiedMaybe;
|
||||
private String etagMaybe;
|
||||
|
||||
@@ -248,8 +251,13 @@ public final class CrawledDocument implements SerializableCrawlData {
|
||||
return this;
|
||||
}
|
||||
|
||||
public CrawledDocumentBuilder requestTimeMs(int requestTimeMs) {
|
||||
this.requestTimeMs = requestTimeMs;
|
||||
return this;
|
||||
}
|
||||
|
||||
public CrawledDocument build() {
|
||||
return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBodyBytes, this.hasCookies, this.lastModifiedMaybe, this.etagMaybe);
|
||||
return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBodyBytes, this.hasCookies, this.requestTimeMs, this.lastModifiedMaybe, this.etagMaybe);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
|
@@ -9,6 +9,7 @@ import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecord;
|
||||
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
|
||||
import nu.marginalia.slop.column.array.ByteArrayColumn;
|
||||
import nu.marginalia.slop.column.primitive.ByteColumn;
|
||||
import nu.marginalia.slop.column.primitive.IntColumn;
|
||||
import nu.marginalia.slop.column.primitive.LongColumn;
|
||||
import nu.marginalia.slop.column.primitive.ShortColumn;
|
||||
import nu.marginalia.slop.column.string.EnumColumn;
|
||||
@@ -39,6 +40,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
long timestamp,
|
||||
String contentType,
|
||||
byte[] body,
|
||||
int requestTimeMs,
|
||||
String headers)
|
||||
{
|
||||
private static final EnumColumn domainColumn = new EnumColumn("domain", StandardCharsets.UTF_8, StorageType.ZSTD);
|
||||
@@ -49,6 +51,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
private static final LongColumn timestampColumn = new LongColumn("timestamp");
|
||||
private static final EnumColumn contentTypeColumn = new EnumColumn("contentType", StandardCharsets.UTF_8);
|
||||
private static final ByteArrayColumn bodyColumn = new ByteArrayColumn("body", StorageType.ZSTD);
|
||||
private static final ShortColumn requestTimeColumn = new ShortColumn("requestTimeMs");
|
||||
private static final StringColumn headerColumn = new StringColumn("header", StandardCharsets.UTF_8, StorageType.ZSTD);
|
||||
|
||||
public SlopCrawlDataRecord(CrawledDocumentParquetRecord parquetRecord) {
|
||||
@@ -60,6 +63,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
parquetRecord.timestamp.toEpochMilli(),
|
||||
parquetRecord.contentType,
|
||||
parquetRecord.body,
|
||||
-1,
|
||||
parquetRecord.headers
|
||||
);
|
||||
}
|
||||
@@ -74,6 +78,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
date.toEpochMilli(),
|
||||
"x-marginalia/advisory;state=redirect",
|
||||
new byte[0],
|
||||
-1,
|
||||
""
|
||||
);
|
||||
}
|
||||
@@ -87,6 +92,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
date.toEpochMilli(),
|
||||
"x-marginalia/advisory;state=error",
|
||||
errorStatus.getBytes(),
|
||||
-1,
|
||||
""
|
||||
);
|
||||
}
|
||||
@@ -100,6 +106,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
date.toEpochMilli(),
|
||||
errorStatus,
|
||||
new byte[0],
|
||||
-1,
|
||||
""
|
||||
);
|
||||
}
|
||||
@@ -321,6 +328,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
private final LongColumn.Writer timestampColumnWriter;
|
||||
private final EnumColumn.Writer contentTypeColumnWriter;
|
||||
private final ByteArrayColumn.Writer bodyColumnWriter;
|
||||
private final ShortColumn.Writer requestTimeColumnWriter;
|
||||
private final StringColumn.Writer headerColumnWriter;
|
||||
|
||||
public Writer(Path path) throws IOException {
|
||||
@@ -334,6 +342,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
timestampColumnWriter = timestampColumn.create(this);
|
||||
contentTypeColumnWriter = contentTypeColumn.create(this);
|
||||
bodyColumnWriter = bodyColumn.create(this);
|
||||
requestTimeColumnWriter = requestTimeColumn.create(this);
|
||||
headerColumnWriter = headerColumn.create(this);
|
||||
}
|
||||
|
||||
@@ -346,6 +355,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
timestampColumnWriter.put(record.timestamp);
|
||||
contentTypeColumnWriter.put(record.contentType);
|
||||
bodyColumnWriter.put(record.body);
|
||||
requestTimeColumnWriter.put((short) record.requestTimeMs);
|
||||
headerColumnWriter.put(record.headers);
|
||||
}
|
||||
|
||||
@@ -391,10 +401,20 @@ public record SlopCrawlDataRecord(String domain,
|
||||
|
||||
String headersStr;
|
||||
StringJoiner headersStrBuilder = new StringJoiner("\n");
|
||||
int requestTimeMs = -1;
|
||||
for (var header : headers) {
|
||||
if (header.getName().equalsIgnoreCase("X-Cookies") && "1".equals(header.getValue())) {
|
||||
hasCookies = true;
|
||||
}
|
||||
if (header.getName().equals("X-Marginalia-Response-Time")) {
|
||||
try {
|
||||
requestTimeMs = Integer.parseInt(header.getValue());
|
||||
}
|
||||
catch (NumberFormatException ex) {
|
||||
logger.warn("Failed to parse X-Marginalia-Response-Time header: {}", header.getValue());
|
||||
}
|
||||
continue;
|
||||
}
|
||||
headersStrBuilder.add(header.getName() + ": " + header.getValue());
|
||||
}
|
||||
headersStr = headersStrBuilder.toString();
|
||||
@@ -409,6 +429,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
response.date().toEpochMilli(),
|
||||
contentType,
|
||||
bodyBytes,
|
||||
requestTimeMs,
|
||||
headersStr
|
||||
)
|
||||
);
|
||||
@@ -461,6 +482,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
private final LongColumn.Reader timestampColumnReader;
|
||||
private final EnumColumn.Reader contentTypeColumnReader;
|
||||
private final ByteArrayColumn.Reader bodyColumnReader;
|
||||
private final ShortColumn.Reader requestTimeColumnReader;
|
||||
private final StringColumn.Reader headerColumnReader;
|
||||
|
||||
public Reader(Path path) throws IOException {
|
||||
@@ -475,6 +497,17 @@ public record SlopCrawlDataRecord(String domain,
|
||||
contentTypeColumnReader = contentTypeColumn.open(this);
|
||||
bodyColumnReader = bodyColumn.open(this);
|
||||
headerColumnReader = headerColumn.open(this);
|
||||
|
||||
// FIXME: After 2025-06-XX, we can remove this migration workaround
|
||||
ShortColumn.Reader timeColumnReader;
|
||||
try {
|
||||
timeColumnReader = requestTimeColumn.open(this);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
// Migration workaround
|
||||
timeColumnReader = null;
|
||||
}
|
||||
requestTimeColumnReader = timeColumnReader;
|
||||
}
|
||||
|
||||
public SlopCrawlDataRecord get() throws IOException {
|
||||
@@ -487,6 +520,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
timestampColumnReader.get(),
|
||||
contentTypeColumnReader.get(),
|
||||
bodyColumnReader.get(),
|
||||
requestTimeColumnReader != null ? requestTimeColumnReader.get() : -1,
|
||||
headerColumnReader.get()
|
||||
);
|
||||
}
|
||||
@@ -506,6 +540,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
private final LongColumn.Reader timestampColumnReader;
|
||||
private final EnumColumn.Reader contentTypeColumnReader;
|
||||
private final ByteArrayColumn.Reader bodyColumnReader;
|
||||
private final ShortColumn.Reader requestTimeColumnReader;
|
||||
private final StringColumn.Reader headerColumnReader;
|
||||
|
||||
private SlopCrawlDataRecord next = null;
|
||||
@@ -522,6 +557,17 @@ public record SlopCrawlDataRecord(String domain,
|
||||
contentTypeColumnReader = contentTypeColumn.open(this);
|
||||
bodyColumnReader = bodyColumn.open(this);
|
||||
headerColumnReader = headerColumn.open(this);
|
||||
|
||||
// FIXME: After 2025-06-XX, we can remove this migration workaround
|
||||
ShortColumn.Reader timeColumnReader;
|
||||
try {
|
||||
timeColumnReader = requestTimeColumn.open(this);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
// Migration workaround
|
||||
timeColumnReader = null;
|
||||
}
|
||||
requestTimeColumnReader = timeColumnReader;
|
||||
}
|
||||
|
||||
public abstract boolean filter(String url, int status, String contentType);
|
||||
@@ -548,6 +594,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
boolean cookies = cookiesColumnReader.get() == 1;
|
||||
int status = statusColumnReader.get();
|
||||
long timestamp = timestampColumnReader.get();
|
||||
int requestTimeMs = requestTimeColumnReader != null ? requestTimeColumnReader.get() : -1;
|
||||
String contentType = contentTypeColumnReader.get();
|
||||
|
||||
LargeItem<byte[]> body = bodyColumnReader.getLarge();
|
||||
@@ -555,7 +602,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
|
||||
if (filter(url, status, contentType)) {
|
||||
next = new SlopCrawlDataRecord(
|
||||
domain, url, ip, cookies, status, timestamp, contentType, body.get(), headers.get()
|
||||
domain, url, ip, cookies, status, timestamp, contentType, body.get(), requestTimeMs, headers.get()
|
||||
);
|
||||
return true;
|
||||
}
|
||||
|
@@ -32,6 +32,7 @@ dependencies {
|
||||
implementation project(':code:index:api')
|
||||
implementation project(':code:processes:process-mq-api')
|
||||
implementation project(':code:libraries:message-queue')
|
||||
implementation project(':code:libraries:domain-lock')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
implementation project(':code:libraries:easy-lsh')
|
||||
implementation project(':code:processes:crawling-process')
|
||||
|
@@ -195,6 +195,7 @@ public class LiveCrawlDataSet implements AutoCloseable {
|
||||
headers,
|
||||
body,
|
||||
false,
|
||||
-1,
|
||||
"",
|
||||
""
|
||||
));
|
||||
|
@@ -10,6 +10,8 @@ import nu.marginalia.api.feeds.FeedsClient;
|
||||
import nu.marginalia.converting.ConverterModule;
|
||||
import nu.marginalia.converting.processor.DomainProcessor;
|
||||
import nu.marginalia.converting.writer.ConverterBatchWriter;
|
||||
import nu.marginalia.coordination.DomainCoordinationModule;
|
||||
import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.db.DomainBlacklist;
|
||||
import nu.marginalia.io.SerializableCrawlDataStream;
|
||||
@@ -58,6 +60,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
private final FileStorageService fileStorageService;
|
||||
private final KeywordLoaderService keywordLoaderService;
|
||||
private final DocumentLoaderService documentLoaderService;
|
||||
private final DomainCoordinator domainCoordinator;
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
@Inject
|
||||
@@ -71,7 +74,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
DomainProcessor domainProcessor,
|
||||
FileStorageService fileStorageService,
|
||||
KeywordLoaderService keywordLoaderService,
|
||||
DocumentLoaderService documentLoaderService, HikariDataSource dataSource)
|
||||
DocumentLoaderService documentLoaderService, DomainCoordinator domainCoordinator, HikariDataSource dataSource)
|
||||
throws Exception
|
||||
{
|
||||
super(messageQueueFactory, config, gson, LIVE_CRAWLER_INBOX);
|
||||
@@ -84,6 +87,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
this.fileStorageService = fileStorageService;
|
||||
this.keywordLoaderService = keywordLoaderService;
|
||||
this.documentLoaderService = documentLoaderService;
|
||||
this.domainCoordinator = domainCoordinator;
|
||||
this.dataSource = dataSource;
|
||||
|
||||
domainBlacklist.waitUntilLoaded();
|
||||
@@ -107,6 +111,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
try {
|
||||
Injector injector = Guice.createInjector(
|
||||
new LiveCrawlerModule(),
|
||||
new DomainCoordinationModule(), // 2 hours lease timeout is enough for the live crawler
|
||||
new ProcessConfigurationModule("crawler"),
|
||||
new ConverterModule(),
|
||||
new ServiceDiscoveryModule(),
|
||||
@@ -172,7 +177,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
|
||||
processHeartbeat.progress(LiveCrawlState.CRAWLING);
|
||||
|
||||
try (SimpleLinkScraper fetcher = new SimpleLinkScraper(dataSet, domainQueries, domainBlacklist);
|
||||
try (SimpleLinkScraper fetcher = new SimpleLinkScraper(dataSet, domainCoordinator, domainQueries, domainBlacklist);
|
||||
var hb = heartbeat.createAdHocTaskHeartbeat("Live Crawling"))
|
||||
{
|
||||
for (Map.Entry<String, List<String>> entry : hb.wrap("Fetching", urlsPerDomain.entrySet())) {
|
||||
|
@@ -5,8 +5,9 @@ import crawlercommons.robots.SimpleRobotRulesParser;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.contenttype.ContentType;
|
||||
import nu.marginalia.contenttype.DocumentBodyToString;
|
||||
import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.coordination.DomainLock;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.logic.DomainLocks;
|
||||
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.db.DomainBlacklist;
|
||||
@@ -46,14 +47,16 @@ public class SimpleLinkScraper implements AutoCloseable {
|
||||
private final DomainBlacklist domainBlacklist;
|
||||
private final Duration connectTimeout = Duration.ofSeconds(10);
|
||||
private final Duration readTimeout = Duration.ofSeconds(10);
|
||||
private final DomainLocks domainLocks = new DomainLocks();
|
||||
private final DomainCoordinator domainCoordinator;
|
||||
|
||||
private final static int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);
|
||||
|
||||
public SimpleLinkScraper(LiveCrawlDataSet dataSet,
|
||||
DomainCoordinator domainCoordinator,
|
||||
DbDomainQueries domainQueries,
|
||||
DomainBlacklist domainBlacklist) {
|
||||
this.dataSet = dataSet;
|
||||
this.domainCoordinator = domainCoordinator;
|
||||
this.domainQueries = domainQueries;
|
||||
this.domainBlacklist = domainBlacklist;
|
||||
}
|
||||
@@ -98,7 +101,7 @@ public class SimpleLinkScraper implements AutoCloseable {
|
||||
.version(HttpClient.Version.HTTP_2)
|
||||
.build();
|
||||
// throttle concurrent access per domain; IDE will complain it's not used, but it holds a semaphore -- do not remove:
|
||||
DomainLocks.DomainLock lock = domainLocks.lockDomain(domain)
|
||||
DomainLock lock = domainCoordinator.lockDomain(domain)
|
||||
) {
|
||||
SimpleRobotRules rules = fetchRobotsRules(rootUrl, client);
|
||||
|
||||
|
@@ -1,5 +1,6 @@
|
||||
package nu.marginalia.livecrawler;
|
||||
|
||||
import nu.marginalia.coordination.LocalDomainCoordinator;
|
||||
import nu.marginalia.db.DomainBlacklistImpl;
|
||||
import nu.marginalia.io.SerializableCrawlDataStream;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
@@ -37,7 +38,7 @@ class SimpleLinkScraperTest {
|
||||
|
||||
@Test
|
||||
public void testRetrieveNow() throws Exception {
|
||||
var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class));
|
||||
var scraper = new SimpleLinkScraper(dataSet, new LocalDomainCoordinator(), null, Mockito.mock(DomainBlacklistImpl.class));
|
||||
int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
|
||||
Assertions.assertEquals(1, fetched);
|
||||
|
||||
@@ -57,7 +58,7 @@ class SimpleLinkScraperTest {
|
||||
@Test
|
||||
public void testRetrieveNow_Redundant() throws Exception {
|
||||
dataSet.saveDocument(1, new EdgeUrl("https://www.marginalia.nu/"), "<html>", "", "127.0.0.1");
|
||||
var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class));
|
||||
var scraper = new SimpleLinkScraper(dataSet, new LocalDomainCoordinator(),null, Mockito.mock(DomainBlacklistImpl.class));
|
||||
|
||||
// If the requested URL is already in the dataSet, we retrieveNow should shortcircuit and not fetch anything
|
||||
int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
|
||||
|
72
code/processes/ping-process/build.gradle
Normal file
72
code/processes/ping-process/build.gradle
Normal file
@@ -0,0 +1,72 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
|
||||
application {
|
||||
mainClass = 'nu.marginalia.ping.PingMain'
|
||||
applicationName = 'ping-process'
|
||||
}
|
||||
|
||||
tasks.distZip.enabled = false
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
|
||||
implementation project(':code:common:db')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
||||
|
||||
implementation project(':code:libraries:domain-lock')
|
||||
implementation project(':code:libraries:geo-ip')
|
||||
implementation project(':code:libraries:message-queue')
|
||||
|
||||
implementation project(':code:processes:process-mq-api')
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.notnull
|
||||
implementation libs.guava
|
||||
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
implementation libs.gson
|
||||
implementation libs.zstd
|
||||
implementation libs.bucket4j
|
||||
implementation libs.crawlercommons
|
||||
implementation libs.jsoup
|
||||
implementation libs.fastutil
|
||||
implementation libs.bundles.curator
|
||||
implementation libs.bundles.mariadb
|
||||
implementation libs.bundles.httpcomponents
|
||||
implementation libs.commons.lang3
|
||||
|
||||
implementation 'org.bouncycastle:bcprov-jdk18on:1.80'
|
||||
implementation 'org.bouncycastle:bcpkix-jdk18on:1.80'
|
||||
implementation 'dnsjava:dnsjava:3.5.2'
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
|
||||
testImplementation libs.wiremock
|
||||
|
||||
|
||||
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
|
||||
testImplementation libs.commons.codec
|
||||
testImplementation 'org.testcontainers:mariadb:1.17.4'
|
||||
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
|
||||
testImplementation project(':code:libraries:test-helpers')
|
||||
|
||||
testImplementation project(':code:processes:test-data')
|
||||
}
|
||||
|
@@ -0,0 +1,84 @@
|
||||
package nu.marginalia.ping;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.ping.model.ErrorClassification;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ThreadLocalRandom;
|
||||
|
||||
public class BackoffStrategy {
|
||||
|
||||
private final Map<ErrorClassification, Duration> baseIntervals;
|
||||
private final Map<ErrorClassification, Duration> maxIntervals;
|
||||
private final Duration okInterval;
|
||||
|
||||
@Inject
|
||||
public BackoffStrategy(PingIntervalsConfiguration pingIntervalsConfiguration) {
|
||||
this.baseIntervals = pingIntervalsConfiguration.baseIntervals();
|
||||
this.maxIntervals = pingIntervalsConfiguration.maxIntervals();
|
||||
this.okInterval = baseIntervals.get(ErrorClassification.NONE);
|
||||
}
|
||||
|
||||
public Duration getOkInterval() {
|
||||
return okInterval;
|
||||
}
|
||||
|
||||
public Duration getUpdateTime(Duration currentDuration,
|
||||
ErrorClassification errorClassification,
|
||||
int backoffConsecutiveFailures) {
|
||||
|
||||
Duration nextBackoff = calculateBackoff(errorClassification, currentDuration, backoffConsecutiveFailures + 1);
|
||||
nextBackoff = addJitter(nextBackoff);
|
||||
|
||||
return nextBackoff;
|
||||
}
|
||||
|
||||
private Duration calculateBackoff(ErrorClassification errorClassification,
|
||||
Duration currentDuration,
|
||||
int backoffConsecutiveFailures) {
|
||||
|
||||
if (currentDuration == null) {
|
||||
return baseIntervals.get(errorClassification);
|
||||
}
|
||||
|
||||
Duration baseInterval = baseIntervals.get(errorClassification);
|
||||
Duration maxInterval = maxIntervals.get(errorClassification);
|
||||
|
||||
if (currentDuration.compareTo(maxInterval) >= 0) {
|
||||
return maxInterval;
|
||||
}
|
||||
|
||||
double multiplier = switch(errorClassification) {
|
||||
case ErrorClassification.UNKNOWN -> 1.5;
|
||||
case ErrorClassification.TIMEOUT -> 2.5;
|
||||
case ErrorClassification.CONNECTION_ERROR -> 2.0;
|
||||
case ErrorClassification.HTTP_CLIENT_ERROR -> 1.7;
|
||||
case ErrorClassification.HTTP_SERVER_ERROR -> 2.0;
|
||||
case ErrorClassification.SSL_ERROR -> 1.8;
|
||||
case ErrorClassification.DNS_ERROR -> 1.5;
|
||||
default -> 2.0; // Default multiplier for any other classification
|
||||
};
|
||||
|
||||
double backoffMinutes = baseInterval.toMinutes()
|
||||
* Math.pow(multiplier, backoffConsecutiveFailures - 1);
|
||||
|
||||
Duration newDuration = Duration.ofMinutes(Math.round(0.5+backoffMinutes));
|
||||
if (newDuration.compareTo(maxInterval) > 0) {
|
||||
return maxInterval;
|
||||
}
|
||||
|
||||
return newDuration;
|
||||
}
|
||||
|
||||
private Duration addJitter(Duration duration) {
|
||||
// Add ±15% jitter to prevent synchronized retries
|
||||
double jitterPercent = 0.15;
|
||||
long baseMinutes = duration.toMinutes();
|
||||
long jitterRange = (long) (baseMinutes * jitterPercent * 2);
|
||||
long jitterOffset = ThreadLocalRandom.current().nextLong(jitterRange + 1) - (jitterRange / 2);
|
||||
|
||||
long finalMinutes = Math.max(1, baseMinutes + jitterOffset);
|
||||
return Duration.ofMinutes(finalMinutes);
|
||||
}
|
||||
}
|
259
code/processes/ping-process/java/nu/marginalia/ping/PingDao.java
Normal file
259
code/processes/ping-process/java/nu/marginalia/ping/PingDao.java
Normal file
@@ -0,0 +1,259 @@
|
||||
package nu.marginalia.ping;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.model.gson.GsonFactory;
|
||||
import nu.marginalia.ping.model.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.time.Instant;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
@Singleton
|
||||
public class PingDao {
|
||||
private final HikariDataSource dataSource;
|
||||
private static final Gson gson = GsonFactory.get();
|
||||
private static final Logger logger = LoggerFactory.getLogger(PingDao.class);
|
||||
|
||||
@Inject
|
||||
public PingDao(HikariDataSource dataSource) {
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
public void write(WritableModel model) {
|
||||
write(List.of(model));
|
||||
}
|
||||
|
||||
public void write(Collection<WritableModel> models) {
|
||||
logger.debug("Writing: {}", models);
|
||||
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
|
||||
// Don't bother with a transaction if there's only one model to write.
|
||||
if (models.size() <= 1) {
|
||||
for (WritableModel model : models) {
|
||||
model.write(conn);
|
||||
}
|
||||
}
|
||||
else { // If there are multiple models, use a transaction to ensure atomicity.
|
||||
conn.setAutoCommit(false);
|
||||
try {
|
||||
for (WritableModel model : models) {
|
||||
model.write(conn);
|
||||
}
|
||||
conn.commit();
|
||||
} catch (SQLException e) {
|
||||
conn.rollback();
|
||||
throw e;
|
||||
} finally {
|
||||
conn.setAutoCommit(true);
|
||||
}
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
throw new RuntimeException("Failed to write model", e);
|
||||
}
|
||||
}
|
||||
|
||||
public void scheduleDnsUpdate(String rootDomainName, Instant timestamp, int priority) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var ps = conn.prepareStatement("""
|
||||
UPDATE DOMAIN_DNS_INFORMATION
|
||||
SET TS_NEXT_DNS_CHECK = ?, DNS_CHECK_PRIORITY = ?
|
||||
WHERE ROOT_DOMAIN_NAME = ?
|
||||
""")) {
|
||||
|
||||
ps.setTimestamp(1, java.sql.Timestamp.from(timestamp));
|
||||
ps.setInt(2, priority);
|
||||
ps.setString(3, rootDomainName);
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
public DomainAvailabilityRecord getDomainPingStatus(int domainId) throws SQLException {
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var ps = conn.prepareStatement("SELECT * FROM DOMAIN_AVAILABILITY_INFORMATION WHERE domain_id = ?")) {
|
||||
|
||||
ps.setInt(1, domainId);
|
||||
ResultSet rs = ps.executeQuery();
|
||||
if (rs.next()) {
|
||||
return new DomainAvailabilityRecord(rs);
|
||||
} else {
|
||||
return null; // or throw an exception if preferred
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public DomainSecurityRecord getDomainSecurityInformation(int domainId) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var ps = conn.prepareStatement("SELECT * FROM DOMAIN_SECURITY_INFORMATION WHERE domain_id = ?")) {
|
||||
|
||||
ps.setInt(1, domainId);
|
||||
ResultSet rs = ps.executeQuery();
|
||||
if (rs.next()) {
|
||||
return new DomainSecurityRecord(rs);
|
||||
} else {
|
||||
return null; // or throw an exception if preferred
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public DomainDnsRecord getDomainDnsRecord(long dnsRootDomainId) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var ps = conn.prepareStatement("SELECT * FROM DOMAIN_DNS_INFORMATION WHERE DNS_ROOT_DOMAIN_ID = ?")) {
|
||||
|
||||
ps.setObject(1, dnsRootDomainId, java.sql.Types.INTEGER);
|
||||
ResultSet rs = ps.executeQuery();
|
||||
if (rs.next()) {
|
||||
return new DomainDnsRecord(rs);
|
||||
} else {
|
||||
return null; // or throw an exception if preferred
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public DomainDnsRecord getDomainDnsRecord(String rootDomainName) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var ps = conn.prepareStatement("SELECT * FROM DOMAIN_DNS_INFORMATION WHERE ROOT_DOMAIN_NAME = ?")) {
|
||||
|
||||
ps.setString(1, rootDomainName);
|
||||
ResultSet rs = ps.executeQuery();
|
||||
if (rs.next()) {
|
||||
return new DomainDnsRecord(rs);
|
||||
} else {
|
||||
return null; // or throw an exception if preferred
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public HistoricalAvailabilityData getHistoricalAvailabilityData(long domainId) throws SQLException {
|
||||
var query = """
|
||||
SELECT EC_DOMAIN.ID, EC_DOMAIN.DOMAIN_NAME, EC_DOMAIN.NODE_AFFINITY, DOMAIN_AVAILABILITY_INFORMATION.*, DOMAIN_SECURITY_INFORMATION.*
|
||||
FROM EC_DOMAIN
|
||||
LEFT JOIN DOMAIN_SECURITY_INFORMATION ON DOMAIN_SECURITY_INFORMATION.DOMAIN_ID = EC_DOMAIN.ID
|
||||
LEFT JOIN DOMAIN_AVAILABILITY_INFORMATION ON DOMAIN_AVAILABILITY_INFORMATION.DOMAIN_ID = EC_DOMAIN.ID
|
||||
WHERE EC_DOMAIN.ID = ?
|
||||
""";
|
||||
try (var conn = dataSource.getConnection();
|
||||
var ps = conn.prepareStatement(query)) {
|
||||
|
||||
ps.setLong(1, domainId);
|
||||
|
||||
ResultSet rs = ps.executeQuery();
|
||||
while (rs.next()) {
|
||||
String domainName = rs.getString("EC_DOMAIN.DOMAIN_NAME");
|
||||
|
||||
DomainAvailabilityRecord dar;
|
||||
DomainSecurityRecord dsr;
|
||||
|
||||
if (rs.getObject("DOMAIN_SECURITY_INFORMATION.DOMAIN_ID", Integer.class) != null)
|
||||
dsr = new DomainSecurityRecord(rs);
|
||||
else
|
||||
dsr = null;
|
||||
|
||||
if (rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.DOMAIN_ID", Integer.class) != null)
|
||||
dar = new DomainAvailabilityRecord(rs);
|
||||
else
|
||||
dar = null;
|
||||
|
||||
if (dar == null) {
|
||||
return new HistoricalAvailabilityData.JustDomainReference(new DomainReference(
|
||||
rs.getInt("EC_DOMAIN.ID"),
|
||||
rs.getInt("EC_DOMAIN.NODE_AFFINITY"),
|
||||
domainName.toLowerCase()
|
||||
));
|
||||
}
|
||||
else {
|
||||
if (dsr != null) {
|
||||
return new HistoricalAvailabilityData.AvailabilityAndSecurity(domainName, dar, dsr);
|
||||
} else {
|
||||
return new HistoricalAvailabilityData.JustAvailability(domainName, dar);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public List<UpdateSchedule.UpdateJob<DomainReference, HistoricalAvailabilityData>> getDomainUpdateSchedule(int nodeId) {
|
||||
List<UpdateSchedule.UpdateJob<DomainReference, HistoricalAvailabilityData>> updateJobs = new ArrayList<>();
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var ps = conn.prepareStatement("""
|
||||
SELECT ID, DOMAIN_NAME, NEXT_SCHEDULED_UPDATE
|
||||
FROM EC_DOMAIN
|
||||
LEFT JOIN DOMAIN_AVAILABILITY_INFORMATION
|
||||
ON EC_DOMAIN.ID = DOMAIN_AVAILABILITY_INFORMATION.DOMAIN_ID
|
||||
WHERE NODE_AFFINITY = ?
|
||||
""")) {
|
||||
ps.setFetchSize(10_000);
|
||||
ps.setInt(1, nodeId);
|
||||
ResultSet rs = ps.executeQuery();
|
||||
while (rs.next()) {
|
||||
int domainId = rs.getInt("ID");
|
||||
String domainName = rs.getString("DOMAIN_NAME");
|
||||
var ts = rs.getTimestamp("NEXT_SCHEDULED_UPDATE");
|
||||
Instant nextUpdate = ts == null ? Instant.now() : ts.toInstant();
|
||||
|
||||
var ref = new DomainReference(domainId, nodeId, domainName.toLowerCase());
|
||||
updateJobs.add(new UpdateSchedule.UpdateJob<>(ref, nextUpdate));
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
throw new RuntimeException("Failed to retrieve domain update schedule", e);
|
||||
}
|
||||
|
||||
logger.info("Found {} availability update jobs for node {}", updateJobs.size(), nodeId);
|
||||
|
||||
return updateJobs;
|
||||
}
|
||||
|
||||
public List<UpdateSchedule.UpdateJob<RootDomainReference, RootDomainReference>> getDnsUpdateSchedule(int nodeId) {
|
||||
List<UpdateSchedule.UpdateJob<RootDomainReference, RootDomainReference>> updateJobs = new ArrayList<>();
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var ps = conn.prepareStatement("""
|
||||
SELECT DISTINCT(DOMAIN_TOP),DOMAIN_DNS_INFORMATION.* FROM EC_DOMAIN
|
||||
LEFT JOIN DOMAIN_DNS_INFORMATION ON ROOT_DOMAIN_NAME = DOMAIN_TOP
|
||||
WHERE EC_DOMAIN.NODE_AFFINITY = ?
|
||||
""")) {
|
||||
ps.setFetchSize(10_000);
|
||||
ps.setInt(1, nodeId);
|
||||
ResultSet rs = ps.executeQuery();
|
||||
while (rs.next()) {
|
||||
Long dnsRootDomainId = rs.getObject("DOMAIN_DNS_INFORMATION.DNS_ROOT_DOMAIN_ID", Long.class);
|
||||
String rootDomainName = rs.getString("DOMAIN_TOP");
|
||||
|
||||
if (dnsRootDomainId == null) {
|
||||
updateJobs.add(
|
||||
new UpdateSchedule.UpdateJob<>(
|
||||
new RootDomainReference.ByName(rootDomainName),
|
||||
Instant.now())
|
||||
);
|
||||
}
|
||||
else {
|
||||
var record = new DomainDnsRecord(rs);
|
||||
updateJobs.add(new UpdateSchedule.UpdateJob<>(
|
||||
new RootDomainReference.ByIdAndName(dnsRootDomainId, rootDomainName),
|
||||
Objects.requireNonNullElseGet(record.tsNextScheduledUpdate(), Instant::now))
|
||||
);
|
||||
}
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
throw new RuntimeException("Failed to retrieve DNS update schedule", e);
|
||||
}
|
||||
|
||||
logger.info("Found {} dns update jobs for node {}", updateJobs.size(), nodeId);
|
||||
|
||||
return updateJobs;
|
||||
}
|
||||
}
|
@@ -0,0 +1,13 @@
|
||||
package nu.marginalia.ping;
|
||||
|
||||
import nu.marginalia.ping.model.ErrorClassification;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.Map;
|
||||
|
||||
public record PingIntervalsConfiguration(
|
||||
Duration dnsUpdateInterval,
|
||||
Map<ErrorClassification, Duration> baseIntervals,
|
||||
Map<ErrorClassification, Duration> maxIntervals
|
||||
) {
|
||||
}
|
@@ -0,0 +1,297 @@
|
||||
package nu.marginalia.ping;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.ping.model.*;
|
||||
import nu.marginalia.ping.svc.DnsPingService;
|
||||
import nu.marginalia.ping.svc.HttpPingService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/** PingJobScheduler is responsible for scheduling and processing ping jobs
|
||||
* for both HTTP pings and DNS lookups. It manages a queue of jobs and processes them
|
||||
* in separate threads, ensuring that domains are pinged and DNS records are updated
|
||||
* efficiently.
|
||||
*/
|
||||
public class PingJobScheduler {
|
||||
private final HttpPingService httpPingService;
|
||||
private final DnsPingService dnsPingService;
|
||||
private final DomainCoordinator domainCoordinator;
|
||||
private final PingDao pingDao;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(PingJobScheduler.class);
|
||||
|
||||
private static final UpdateSchedule<RootDomainReference, RootDomainReference> dnsUpdateSchedule
|
||||
= new UpdateSchedule<>(250_000);
|
||||
private static final UpdateSchedule<DomainReference, HistoricalAvailabilityData> availabilityUpdateSchedule
|
||||
= new UpdateSchedule<>(250_000);
|
||||
|
||||
public volatile Instant dnsLastSync = Instant.now();
|
||||
public volatile Instant availabilityLastSync = Instant.now();
|
||||
|
||||
public volatile Integer nodeId = null;
|
||||
public volatile boolean running = false;
|
||||
|
||||
private final List<Thread> allThreads = new ArrayList<>();
|
||||
|
||||
@Inject
|
||||
public PingJobScheduler(HttpPingService httpPingService,
|
||||
DnsPingService dnsPingService,
|
||||
DomainCoordinator domainCoordinator,
|
||||
PingDao pingDao)
|
||||
{
|
||||
this.httpPingService = httpPingService;
|
||||
this.dnsPingService = dnsPingService;
|
||||
this.domainCoordinator = domainCoordinator;
|
||||
this.pingDao = pingDao;
|
||||
}
|
||||
|
||||
public synchronized void start() {
|
||||
if (running)
|
||||
return;
|
||||
|
||||
nodeId = null;
|
||||
|
||||
running = true;
|
||||
|
||||
allThreads.add(Thread.ofPlatform().daemon().name("sync-dns").start(this::syncAvailabilityJobs));
|
||||
allThreads.add(Thread.ofPlatform().daemon().name("sync-availability").start(this::syncDnsRecords));
|
||||
|
||||
int availabilityThreads = Integer.getInteger("ping.availabilityThreads", 8);
|
||||
int pingThreads = Integer.getInteger("ping.dnsThreads", 2);
|
||||
|
||||
for (int i = 0; i < availabilityThreads; i++) {
|
||||
allThreads.add(Thread.ofPlatform().daemon().name("availability-job-consumer-" + i).start(this::availabilityJobConsumer));
|
||||
}
|
||||
for (int i = 0; i < pingThreads; i++) {
|
||||
allThreads.add(Thread.ofPlatform().daemon().name("dns-job-consumer-" + i).start(this::dnsJobConsumer));
|
||||
}
|
||||
}
|
||||
|
||||
public void stop() {
|
||||
running = false;
|
||||
for (Thread thread : allThreads) {
|
||||
try {
|
||||
thread.interrupt();
|
||||
thread.join();
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
logger.error("Failed to join thread: " + thread.getName(), e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void pause(int nodeId) {
|
||||
logger.info("Pausing PingJobScheduler for nodeId: {}", nodeId);
|
||||
|
||||
if (this.nodeId != null && this.nodeId != nodeId) {
|
||||
logger.warn("Attempted to pause PingJobScheduler with mismatched nodeId: expected {}, got {}", this.nodeId, nodeId);
|
||||
return;
|
||||
}
|
||||
this.nodeId = null;
|
||||
|
||||
availabilityUpdateSchedule.clear();
|
||||
dnsUpdateSchedule.clear();
|
||||
|
||||
logger.info("PingJobScheduler paused");
|
||||
}
|
||||
|
||||
public synchronized void enableForNode(int nodeId) {
|
||||
logger.info("Resuming PingJobScheduler for nodeId: {}", nodeId);
|
||||
if (this.nodeId != null) {
|
||||
logger.warn("Attempted to resume PingJobScheduler with mismatched nodeId: expected {}, got {}", this.nodeId, nodeId);
|
||||
return;
|
||||
}
|
||||
|
||||
availabilityUpdateSchedule.replaceQueue(pingDao.getDomainUpdateSchedule(nodeId));
|
||||
dnsUpdateSchedule.replaceQueue(pingDao.getDnsUpdateSchedule(nodeId));
|
||||
dnsLastSync = Instant.now();
|
||||
availabilityLastSync = Instant.now();
|
||||
|
||||
// Flag that we are running again
|
||||
this.nodeId = nodeId;
|
||||
|
||||
notifyAll();
|
||||
logger.info("PingJobScheduler resumed");
|
||||
}
|
||||
|
||||
    /**
     * Blocks the calling thread until the scheduler is assigned a node id by
     * {@link #enableForNode(int)}.  The loop guards against spurious wakeups.
     *
     * @throws InterruptedException if the thread is interrupted while waiting
     */
    public synchronized void waitForResume() throws InterruptedException {
        while (nodeId == null) {
            wait();
        }
    }
|
||||
|
||||
    /**
     * Worker loop for availability checks.  Repeatedly:
     * <ol>
     *   <li>blocks while the scheduler is paused (nodeId == null),</li>
     *   <li>takes the next due domain from the availability schedule, skipping
     *       domains another process appears to hold a lock on,</li>
     *   <li>loads historical availability/security data for the domain,</li>
     *   <li>pings it via httpPingService, persists the resulting models, and</li>
     *   <li>re-schedules the domain at the first nextUpdateTime() any model reports.</li>
     * </ol>
     * Per-domain errors are logged and the loop continues; interruption ends the loop.
     */
    private void availabilityJobConsumer() {
        while (running) {
            try {
                // Snapshot the volatile nodeId; block while paused
                Integer nid = nodeId;
                if (nid == null) {
                    waitForResume();
                    continue;
                }

                // Pick the next due domain whose lock is likely available.
                // Rejected (locked) domains are returned to the schedule by nextIf().
                DomainReference ref = availabilityUpdateSchedule.nextIf(domain -> {
                    EdgeDomain domainObj = new EdgeDomain(domain.domainName());
                    if (!domainCoordinator.isLockableHint(domainObj)) {
                        return false; // Skip locked domains
                    }
                    return true; // Process this domain
                });

                long nextId = ref.domainId();
                var data = pingDao.getHistoricalAvailabilityData(nextId);
                if (data == null) {
                    logger.warn("No availability data found for ID: {}", nextId);
                    continue; // No data to process, skip this iteration
                }

                try {
                    // Dispatch on how much history we have for the domain:
                    // none, availability only, or availability + TLS security record.
                    List<WritableModel> objects = switch (data) {
                        case HistoricalAvailabilityData.JustDomainReference(DomainReference reference)
                                -> httpPingService.pingDomain(reference, null, null);
                        case HistoricalAvailabilityData.JustAvailability(String domain, DomainAvailabilityRecord record)
                                -> httpPingService.pingDomain(
                                new DomainReference(record.domainId(), record.nodeId(), domain), record, null);
                        case HistoricalAvailabilityData.AvailabilityAndSecurity(String domain, DomainAvailabilityRecord availability, DomainSecurityRecord security)
                                -> httpPingService.pingDomain(
                                new DomainReference(availability.domainId(), availability.nodeId(), domain), availability, security);
                    };

                    pingDao.write(objects);

                    // Re-schedule the next update time for the domain
                    // (first non-null nextUpdateTime() wins)
                    for (var object : objects) {
                        var ts = object.nextUpdateTime();
                        if (ts != null) {
                            availabilityUpdateSchedule.add(ref, ts);
                            break;
                        }
                    }
                }
                catch (Exception e) {
                    // Per-domain failure: log and move on to the next scheduled domain
                    logger.error("Error processing availability job for domain: " + data.domain(), e);
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                logger.error("Availability job consumer interrupted", e);
                break;
            } catch (Exception e) {
                // Unexpected failure outside per-domain handling; keep the worker alive
                logger.error("Error processing availability job", e);
            }
        }
    }
|
||||
|
||||
    /**
     * Worker loop for DNS record refreshes.  Mirrors availabilityJobConsumer():
     * blocks while paused, takes the next due root domain from the DNS schedule,
     * pings it via dnsPingService (with any previously stored record for diffing),
     * persists the results, and re-schedules the domain at the first
     * nextUpdateTime() any resulting model reports.  Per-domain errors are logged
     * and the loop continues; interruption ends the loop.
     */
    private void dnsJobConsumer() {
        while (running) {
            try {
                // Snapshot the volatile nodeId; block while paused
                Integer nid = nodeId;
                if (nid == null) {
                    waitForResume();
                    continue;
                }

                RootDomainReference ref = dnsUpdateSchedule.next();

                try {
                    // Resolve the prior DNS record (required when referenced by id,
                    // optional when referenced by name) and ping the root domain.
                    List<WritableModel> objects = switch(ref) {
                        case RootDomainReference.ByIdAndName(long id, String name) -> {
                            var oldRecord = Objects.requireNonNull(pingDao.getDomainDnsRecord(id));
                            yield dnsPingService.pingDomain(oldRecord.rootDomainName(), oldRecord);
                        }
                        case RootDomainReference.ByName(String name) -> {
                            @Nullable var oldRecord = pingDao.getDomainDnsRecord(name);
                            yield dnsPingService.pingDomain(name, oldRecord);
                        }
                    };

                    pingDao.write(objects);

                    // Re-schedule the next update time for the domain
                    // (first non-null nextUpdateTime() wins)
                    for (var object : objects) {
                        var ts = object.nextUpdateTime();
                        if (ts != null) {
                            dnsUpdateSchedule.add(ref, ts);
                            break;
                        }
                    }
                }
                catch (Exception e) {
                    // Per-domain failure: log and move on to the next scheduled domain
                    logger.error("Error processing DNS job for domain: " + ref, e);
                }

            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                logger.error("DNS job consumer interrupted", e);
                break;
            } catch (Exception e) {
                // Unexpected failure outside per-domain handling; keep the worker alive
                logger.error("Error processing DNS job", e);
            }
        }
    }
|
||||
|
||||
private void syncAvailabilityJobs() {
|
||||
try {
|
||||
while (running) {
|
||||
|
||||
// If we are suspended, wait for resume
|
||||
Integer nid = nodeId;
|
||||
if (nid == null) {
|
||||
waitForResume();
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if we need to refresh the availability data
|
||||
Instant nextRefresh = availabilityLastSync.plus(Duration.ofHours(24));
|
||||
if (Instant.now().isBefore(nextRefresh)) {
|
||||
Duration remaining = Duration.between(Instant.now(), nextRefresh);
|
||||
TimeUnit.MINUTES.sleep(Math.max(1, remaining.toMinutes()));
|
||||
continue;
|
||||
}
|
||||
|
||||
availabilityUpdateSchedule.replaceQueue(pingDao.getDomainUpdateSchedule(nid));
|
||||
availabilityLastSync = Instant.now();
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Error fetching new ping jobs", e);
|
||||
}
|
||||
}
|
||||
|
||||
    /**
     * Background loop that refreshes the in-memory DNS schedule from the database
     * every 24 hours (sleeping in minute-granularity increments until the next
     * refresh is due), and blocks while the scheduler is paused.  Interruption
     * terminates the loop with the interrupt flag restored.
     */
    private void syncDnsRecords() {
        try {
            while (running) {

                // Snapshot the volatile nodeId; block while paused
                Integer nid = nodeId;
                if (nid == null) {
                    waitForResume();
                    continue; // re-fetch the records after resuming
                }

                // Check if we need to refresh the availability data
                Instant nextRefresh = dnsLastSync.plus(Duration.ofHours(24));
                if (Instant.now().isBefore(nextRefresh)) {
                    Duration remaining = Duration.between(Instant.now(), nextRefresh);
                    // Sleep at most the remaining time, at least one minute,
                    // re-checking periodically so a pause is noticed promptly
                    TimeUnit.MINUTES.sleep(Math.max(1, remaining.toMinutes()));
                    continue;
                }

                dnsUpdateSchedule.replaceQueue(pingDao.getDnsUpdateSchedule(nid));
                dnsLastSync = Instant.now();
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            logger.error("DNS job fetch interrupted", e);
        }
    }
|
||||
|
||||
|
||||
}
|
@@ -0,0 +1,102 @@
|
||||
package nu.marginalia.ping;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Injector;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.coordination.DomainCoordinationModule;
|
||||
import nu.marginalia.geoip.GeoIpDictionary;
|
||||
import nu.marginalia.mq.MessageQueueFactory;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.mqapi.ping.PingRequest;
|
||||
import nu.marginalia.process.ProcessConfiguration;
|
||||
import nu.marginalia.process.ProcessConfigurationModule;
|
||||
import nu.marginalia.process.ProcessMainClass;
|
||||
import nu.marginalia.service.module.DatabaseModule;
|
||||
import nu.marginalia.service.module.ServiceDiscoveryModule;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.security.Security;
|
||||
|
||||
public class PingMain extends ProcessMainClass {
|
||||
private static final Logger log = LoggerFactory.getLogger(PingMain.class);
|
||||
|
||||
private final PingJobScheduler pingJobScheduler;
|
||||
private final int node;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(PingMain.class);
|
||||
|
||||
@Inject
|
||||
public PingMain(MessageQueueFactory messageQueueFactory,
|
||||
ProcessConfiguration config,
|
||||
Gson gson,
|
||||
PingJobScheduler pingJobScheduler,
|
||||
ProcessConfiguration processConfiguration
|
||||
) {
|
||||
super(messageQueueFactory, config, gson, ProcessInboxNames.PING_INBOX);
|
||||
|
||||
this.pingJobScheduler = pingJobScheduler;
|
||||
this.node = processConfiguration.node();
|
||||
}
|
||||
|
||||
public void runPrimary() {
|
||||
log.info("Starting PingMain...");
|
||||
|
||||
// Start the ping job scheduler
|
||||
pingJobScheduler.start();
|
||||
pingJobScheduler.enableForNode(node);
|
||||
|
||||
log.info("PingMain started successfully.");
|
||||
}
|
||||
|
||||
public static void main(String... args) throws Exception {
|
||||
// Prevent Java from caching DNS lookups forever (filling up the system RAM as a result)
|
||||
Security.setProperty("networkaddress.cache.ttl" , "3600");
|
||||
|
||||
// This must run *early*
|
||||
System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
|
||||
|
||||
// If these aren't set properly, the JVM will hang forever on some requests
|
||||
System.setProperty("sun.net.client.defaultConnectTimeout", "30000");
|
||||
System.setProperty("sun.net.client.defaultReadTimeout", "30000");
|
||||
|
||||
// Set the maximum number of connections to keep alive in the connection pool
|
||||
System.setProperty("jdk.httpclient.idleTimeout", "15"); // 15 seconds
|
||||
System.setProperty("jdk.httpclient.connectionPoolSize", "256");
|
||||
|
||||
// We don't want to use too much memory caching sessions for https
|
||||
System.setProperty("javax.net.ssl.sessionCacheSize", "2048");
|
||||
|
||||
|
||||
Injector injector = Guice.createInjector(
|
||||
new PingModule(),
|
||||
new ServiceDiscoveryModule(),
|
||||
new DomainCoordinationModule(),
|
||||
new ProcessConfigurationModule("ping"),
|
||||
new DatabaseModule(false)
|
||||
);
|
||||
|
||||
GeoIpDictionary geoIpDictionary = injector.getInstance(GeoIpDictionary.class);
|
||||
|
||||
geoIpDictionary.waitReady(); // Ensure the GeoIpDictionary is ready before proceeding
|
||||
|
||||
PingMain main = injector.getInstance(PingMain.class);
|
||||
|
||||
var instructions = main.fetchInstructions(PingRequest.class);
|
||||
|
||||
try {
|
||||
main.runPrimary();
|
||||
for(;;)
|
||||
synchronized (main) { // Wait on the object lock to avoid busy-looping
|
||||
main.wait();
|
||||
}
|
||||
}
|
||||
catch (Throwable ex) {
|
||||
logger.error("Error running ping process", ex);
|
||||
instructions.err();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,82 @@
|
||||
package nu.marginalia.ping;
|
||||
|
||||
import com.google.inject.AbstractModule;
|
||||
import com.google.inject.Provides;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.ping.io.HttpClientProvider;
|
||||
import nu.marginalia.ping.model.ErrorClassification;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.time.Duration;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class PingModule extends AbstractModule {
|
||||
|
||||
public PingModule() throws NoSuchAlgorithmException {
|
||||
}
|
||||
|
||||
public static PingIntervalsConfiguration createPingIntervalsConfiguration() {
|
||||
Map<ErrorClassification, Duration> initialTimeouts = new HashMap<>();
|
||||
Map<ErrorClassification, Duration> maxTimeouts = new HashMap<>();
|
||||
|
||||
for (var classification : ErrorClassification.values()) {
|
||||
switch (classification) {
|
||||
case CONNECTION_ERROR -> {
|
||||
initialTimeouts.put(classification, Duration.ofMinutes(15));
|
||||
maxTimeouts.put(classification, Duration.ofDays(1));
|
||||
}
|
||||
case HTTP_CLIENT_ERROR -> {
|
||||
initialTimeouts.put(classification, Duration.ofMinutes(15));
|
||||
maxTimeouts.put(classification, Duration.ofDays(1));
|
||||
}
|
||||
case HTTP_SERVER_ERROR -> {
|
||||
initialTimeouts.put(classification, Duration.ofMinutes(8));
|
||||
maxTimeouts.put(classification, Duration.ofHours(6));
|
||||
}
|
||||
case SSL_ERROR -> {
|
||||
initialTimeouts.put(classification, Duration.ofMinutes(45));
|
||||
maxTimeouts.put(classification, Duration.ofDays(1));
|
||||
}
|
||||
case DNS_ERROR -> {
|
||||
initialTimeouts.put(classification, Duration.ofMinutes(60));
|
||||
maxTimeouts.put(classification, Duration.ofDays(7));
|
||||
}
|
||||
case TIMEOUT -> {
|
||||
initialTimeouts.put(classification, Duration.ofMinutes(5));
|
||||
maxTimeouts.put(classification, Duration.ofHours(6));
|
||||
}
|
||||
case UNKNOWN -> {
|
||||
initialTimeouts.put(classification, Duration.ofMinutes(30));
|
||||
maxTimeouts.put(classification, Duration.ofDays(1));
|
||||
}
|
||||
case NONE -> {
|
||||
initialTimeouts.put(classification, Duration.ofHours(6));
|
||||
maxTimeouts.put(classification, Duration.ofDays(6));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return new PingIntervalsConfiguration(
|
||||
Duration.ofHours(3),
|
||||
initialTimeouts,
|
||||
maxTimeouts
|
||||
);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void configure() {
|
||||
bind(HttpClient.class).toProvider(HttpClientProvider.class);
|
||||
|
||||
bind(PingIntervalsConfiguration.class).toInstance(createPingIntervalsConfiguration());
|
||||
}
|
||||
|
||||
@Provides
|
||||
@Named("ping.nameservers")
|
||||
public List<String> providePingNameservers() {
|
||||
// Google's public DNS servers currently have the best rate limiting
|
||||
return List.of("8.8.8.8", "8.8.4.4");
|
||||
}
|
||||
}
|
@@ -0,0 +1,109 @@
|
||||
package nu.marginalia.ping;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.*;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
/** In-memory schedule for updates, allowing jobs to be added and processed in order of their scheduled time.
|
||||
* This is not a particularly high-performance implementation, but exists to take contention off the database's
|
||||
* timestamp index.
|
||||
* */
|
||||
/** In-memory schedule for updates, allowing jobs to be added and processed in order of their scheduled time.
 * This is not a particularly high-performance implementation, but exists to take contention off the database's
 * timestamp index.
 * <p>
 * All public methods synchronize on {@code this}; consumers block in {@code next()}/{@code nextIf()}
 * until a job becomes due, and producers wake them via {@code notifyAll()}.
 * NOTE(review): the second type parameter T2 is unused by this class's logic;
 * presumably it documents the payload type of the associated data — confirm against callers.
 * */
public class UpdateSchedule<T, T2> {
    // Min-heap ordered by scheduled update time; guarded by the object monitor
    private final PriorityQueue<UpdateJob<T, T2>> updateQueue;
    public record UpdateJob<T, T2>(T key, Instant updateTime) {}

    public UpdateSchedule(int initialCapacity) {
        updateQueue = new PriorityQueue<>(initialCapacity, Comparator.comparing(UpdateJob::updateTime));
    }

    /** Schedules key for processing at updateTime and wakes any waiting consumers. */
    public synchronized void add(T key, Instant updateTime) {
        updateQueue.add(new UpdateJob<>(key, updateTime));
        notifyAll();
    }

    /** Returns the next job in the queue that is due to be processed.
     * If no jobs are due, it will block until a job is added or a job becomes due.
     * */
    public synchronized T next() throws InterruptedException {
        while (true) {
            if (updateQueue.isEmpty()) {
                wait(); // Wait for a new job to be added
                continue;
            }

            UpdateJob<T, T2> job = updateQueue.peek();
            Instant now = Instant.now();

            if (job.updateTime.isAfter(now)) {
                // Head not yet due: wait until its due time (or an earlier add() wakes us)
                Duration toWait = Duration.between(now, job.updateTime);
                wait(Math.max(1, toWait.toMillis()));
            }
            else {
                updateQueue.poll(); // Remove the job from the queue since it's due
                return job.key();
            }
        }
    }


    /** Returns the first job in the queue matching the predicate that is not scheduled into the future,
     * blocking until a job is added or a job becomes due.
     * Jobs rejected by the predicate are held aside and re-inserted before any wait
     * (and unconditionally on exit, via the finally block), so they remain visible
     * to other consumer threads.
     */
    public synchronized T nextIf(Predicate<T> predicate) throws InterruptedException {
        List<UpdateJob<T, T2>> rejectedJobs = new ArrayList<>();

        try {
            while (true) {
                if (updateQueue.isEmpty()) {
                    wait(); // Wait for a new job to be added
                    continue;
                }

                UpdateJob<T, T2> job = updateQueue.peek();
                Instant now = Instant.now();

                if (job.updateTime.isAfter(now)) {
                    Duration toWait = Duration.between(now, job.updateTime);

                    // Return the rejected jobs to the queue for other threads to process
                    updateQueue.addAll(rejectedJobs);
                    if (!rejectedJobs.isEmpty())
                        notifyAll();
                    rejectedJobs.clear();

                    wait(Math.max(1, toWait.toMillis()));
                } else {
                    var candidate = updateQueue.poll(); // Remove the job from the queue since it's due

                    assert candidate != null : "Update job should not be null at this point, since we just peeked it in a synchronized block";

                    if (!predicate.test(candidate.key())) {
                        // Hold the rejected job aside so we don't re-examine it this pass
                        rejectedJobs.add(candidate);
                    }
                    else {
                        return candidate.key();
                    }
                }
            }
        }
        finally {
            // Return the rejected jobs to the queue for other threads to process
            updateQueue.addAll(rejectedJobs);
            if (!rejectedJobs.isEmpty())
                notifyAll();
        }

    }

    /** Removes all scheduled jobs and wakes any waiting consumers. */
    public synchronized void clear() {
        updateQueue.clear();
        notifyAll();
    }

    /** Atomically replaces the entire schedule with newJobs and wakes any waiting consumers. */
    public synchronized void replaceQueue(Collection<UpdateJob<T,T2>> newJobs) {
        updateQueue.clear();
        updateQueue.addAll(newJobs);
        notifyAll();
    }
}
|
@@ -0,0 +1,96 @@
|
||||
package nu.marginalia.ping.fetcher;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.ping.model.SingleDnsRecord;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.xbill.DNS.ExtendedResolver;
|
||||
import org.xbill.DNS.Lookup;
|
||||
import org.xbill.DNS.TextParseException;
|
||||
import org.xbill.DNS.Type;
|
||||
|
||||
import java.net.UnknownHostException;
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.*;
|
||||
|
||||
/**
 * Fetches a fixed set of DNS record types (A, AAAA, NS, MX, TXT, SOA, CNAME,
 * CAA, SPF) for a domain via dnsjava, querying all record types in parallel on
 * a shared fixed thread pool.  Resolvers are per-thread (dnsjava resolvers are
 * not safely shareable across threads) and load-balance across the configured
 * nameservers with a 5 second timeout.
 */
public class PingDnsFetcher {
    // One resolver per worker thread
    private final ThreadLocal<ExtendedResolver> resolver;
    // Shared pool for parallel per-record-type lookups; sized for many concurrent dig() calls
    private static final ExecutorService digExecutor = Executors.newFixedThreadPool(100);
    private static final Logger logger = LoggerFactory.getLogger(PingDnsFetcher.class);

    // Record types queried by dig(), one task per type
    private static final int[] RECORD_TYPES = {
            Type.A, Type.AAAA, Type.NS, Type.MX, Type.TXT,
            Type.SOA, Type.CNAME, Type.CAA, Type.SPF
    };

    @Inject
    public PingDnsFetcher(@Named("ping.nameservers")
                          List<String> nameservers) {
        resolver = ThreadLocal.withInitial(() -> createResolver(nameservers));
    }

    /** Builds a load-balancing resolver over the given nameservers with a 5s timeout. */
    private ExtendedResolver createResolver(List<String> nameservers) {
        try {
            ExtendedResolver r = new ExtendedResolver(
                    nameservers.toArray(new String[0])
            );
            r.setLoadBalance(true);
            r.setTimeout(Duration.ofSeconds(5));
            return r;
        }
        catch (UnknownHostException e) {
            throw new RuntimeException("Failed to create DNS resolver", e);
        }
    }

    /**
     * Runs a single lookup for one record type; returns an empty list when the
     * lookup yields no results.
     *
     * @throws TextParseException if domainName is not a valid DNS name
     */
    private List<SingleDnsRecord> query(String domainName, int recordType) throws TextParseException {
        var resolver = this.resolver.get();
        var query = new Lookup(domainName, recordType);
        query.setResolver(resolver);

        var result = query.run();

        if (result == null || result.length == 0) {
            return List.of();
        }

        List<SingleDnsRecord> records = new ArrayList<>(result.length);

        for (var record : result) {
            if (record == null) continue;
            records.add(new SingleDnsRecord(
                    Type.string(recordType),
                    record.toString())
            );

        }

        return records;
    }

    /**
     * Queries all RECORD_TYPES for the domain in parallel and returns the
     * combined results.  Individual lookup failures are logged and skipped;
     * interruption returns whatever was collected so far with the interrupt
     * flag restored.
     */
    public List<SingleDnsRecord> dig(String domainName) {
        List<Callable<List<SingleDnsRecord>>> tasks = new ArrayList<>(RECORD_TYPES.length);
        for (var recordType : RECORD_TYPES) {
            tasks.add(() -> query(domainName, recordType));
        }
        List<SingleDnsRecord> results = new ArrayList<>(RECORD_TYPES.length);
        try {
            List<Future<List<SingleDnsRecord>>> futures = digExecutor.invokeAll(tasks);
            for (Future<List<SingleDnsRecord>> future : futures) {
                try {
                    results.addAll(future.get(1, TimeUnit.MINUTES));
                } catch (Exception e) {
                    logger.error("Error fetching DNS records", e);
                }
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            logger.error("DNS query interrupted", e);
        }
        return results;
    }

}
|
@@ -0,0 +1,94 @@
|
||||
package nu.marginalia.ping.fetcher;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.UserAgent;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.ping.fetcher.response.*;
|
||||
import org.apache.hc.client5.http.HttpHostConnectException;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.protocol.HttpClientContext;
|
||||
import org.apache.hc.core5.http.Header;
|
||||
import org.apache.hc.core5.http.io.entity.EntityUtils;
|
||||
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class PingHttpFetcher {
|
||||
private final UserAgent userAgent = WmsaHome.getUserAgent();
|
||||
private final HttpClient client;
|
||||
|
||||
@Inject
|
||||
public PingHttpFetcher(HttpClient client) {
|
||||
this.client = client;
|
||||
}
|
||||
|
||||
public PingRequestResponse fetchUrl(String url, Method method, String etag, String lastModified) {
|
||||
|
||||
var builder = ClassicRequestBuilder.create(method.name())
|
||||
.setUri(url)
|
||||
.addHeader("Accept", "text/*, */*;q=0.9")
|
||||
.addHeader("User-Agent", userAgent.uaString())
|
||||
.addHeader("Accept-Encoding", "gzip");
|
||||
if (etag != null) {
|
||||
builder.addHeader("If-None-Match", etag);
|
||||
}
|
||||
if (lastModified != null) {
|
||||
builder.addHeader("If-Modified-Since", lastModified);
|
||||
}
|
||||
|
||||
var req = builder.build();
|
||||
|
||||
HttpClientContext context = HttpClientContext.create();
|
||||
try {
|
||||
Instant start = Instant.now();
|
||||
return client.execute(req, context, (rsp) -> {
|
||||
|
||||
var entity = rsp.getEntity();
|
||||
|
||||
try {
|
||||
|
||||
Header[] rawHeaders = rsp.getHeaders();
|
||||
Map<String, List<String>> headers = new HashMap<>(rawHeaders.length);
|
||||
for (Header header : rawHeaders) {
|
||||
headers.computeIfAbsent(header.getName(), k -> new ArrayList<>())
|
||||
.add(header.getValue());
|
||||
}
|
||||
|
||||
if (method == Method.GET && entity == null) {
|
||||
return new ProtocolError("GET request returned no content");
|
||||
}
|
||||
|
||||
byte[] body = entity != null ? EntityUtils.toByteArray(entity) : null;
|
||||
|
||||
Duration responseTime = Duration.between(start, Instant.now());
|
||||
|
||||
return PingRequestResponse.of(
|
||||
rsp.getVersion(),
|
||||
rsp.getCode(),
|
||||
body,
|
||||
headers,
|
||||
responseTime,
|
||||
context.getSSLSession()
|
||||
);
|
||||
} finally {
|
||||
EntityUtils.consume(entity);
|
||||
}
|
||||
});
|
||||
} catch (SocketTimeoutException ex) {
|
||||
return new TimeoutResponse(ex.getMessage());
|
||||
} catch (HttpHostConnectException e) {
|
||||
return new ConnectionError(e.getClass().getSimpleName());
|
||||
} catch (IOException e) {
|
||||
return new ProtocolError(e.getClass().getSimpleName());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,4 @@
|
||||
package nu.marginalia.ping.fetcher.response;
|
||||
|
||||
/** Ping outcome: the TCP connection to the host could not be established. */
public record ConnectionError(String errorMessage) implements PingRequestResponse {
}
|
@@ -0,0 +1,18 @@
|
||||
package nu.marginalia.ping.fetcher.response;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public record Headers(Map<String, List<String>> headers) {
|
||||
public List<String> get(String name) {
|
||||
return headers.getOrDefault(name, List.of());
|
||||
}
|
||||
|
||||
public String getFirst(String name) {
|
||||
return headers.getOrDefault(name, List.of()).stream().findFirst().orElse(null);
|
||||
}
|
||||
|
||||
public boolean contains(String name) {
|
||||
return headers.containsKey(name);
|
||||
}
|
||||
}
|
@@ -0,0 +1,12 @@
|
||||
package nu.marginalia.ping.fetcher.response;
|
||||
|
||||
import java.time.Duration;
|
||||
|
||||
/**
 * Ping outcome: a completed plain-HTTP exchange (no TLS session established).
 * NOTE(review): the byte[] body field gives this record reference-based
 * equals/hashCode semantics — confirm callers never compare these by value.
 */
public record HttpResponse(
        String version,
        int httpStatus,
        byte[] body,
        Headers headers,
        Duration httpResponseTime
) implements PingRequestResponse {
}
|
@@ -0,0 +1,15 @@
|
||||
package nu.marginalia.ping.fetcher.response;
|
||||
|
||||
import java.security.cert.Certificate;
|
||||
import java.time.Duration;
|
||||
|
||||
/**
 * Ping outcome: a completed HTTPS exchange, including the peer certificate
 * chain and TLS session metadata.
 * NOTE(review): the byte[] and Certificate[] fields give this record
 * reference-based equals/hashCode semantics — confirm callers never compare
 * these by value.
 */
public record HttpsResponse(
        String version,
        int httpStatus,
        byte[] body,
        Headers headers,
        Certificate[] sslCertificates,
        SslMetadata sslMetadata,
        Duration httpResponseTime
) implements PingRequestResponse {
}
|
@@ -0,0 +1,5 @@
|
||||
package nu.marginalia.ping.fetcher.response;
|
||||
|
||||
/** HTTP request methods supported by the ping fetcher. */
public enum Method {
    GET, HEAD
}
|
@@ -0,0 +1,22 @@
|
||||
package nu.marginalia.ping.fetcher.response;
|
||||
|
||||
import org.apache.hc.core5.http.ProtocolVersion;
|
||||
|
||||
import javax.net.ssl.SSLPeerUnverifiedException;
|
||||
import javax.net.ssl.SSLSession;
|
||||
import java.time.Duration;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Sealed result hierarchy for a single ping request: successful exchanges
 * (HttpResponse/HttpsResponse) and the various failure modes.
 */
public sealed interface PingRequestResponse
        permits HttpResponse, HttpsResponse, TimeoutResponse, ConnectionError, ProtocolError, UnknownHostError {
    /**
     * Builds a success response from the raw exchange data; yields an
     * HttpsResponse when a TLS session was established, otherwise an HttpResponse.
     *
     * @throws SSLPeerUnverifiedException if the TLS peer's certificates cannot be obtained
     */
    static PingRequestResponse of(ProtocolVersion version, int httpStatus, byte[] body, Map<String, List<String>> headers, Duration time, SSLSession sslSession) throws SSLPeerUnverifiedException {

        if (sslSession == null) {
            return new HttpResponse(version.toString(), httpStatus, body, new Headers(headers), time);
        } else {
            return new HttpsResponse(version.toString(), httpStatus, body, new Headers(headers), sslSession.getPeerCertificates(), new SslMetadata(sslSession), time);
        }
    }

}
|
@@ -0,0 +1,4 @@
|
||||
package nu.marginalia.ping.fetcher.response;
|
||||
|
||||
/** Ping outcome: an I/O or protocol-level failure during the exchange. */
public record ProtocolError(String errorMessage) implements PingRequestResponse {
}
|
@@ -0,0 +1,14 @@
|
||||
package nu.marginalia.ping.fetcher.response;
|
||||
|
||||
import javax.net.ssl.SSLSession;
|
||||
|
||||
/** TLS session summary: the negotiated cipher suite and protocol version. */
public record SslMetadata(
        String cipherSuite,
        String protocol) {
    /** Extracts cipher suite and protocol from an established SSL session. */
    public SslMetadata(SSLSession session) {
        this(
                session.getCipherSuite(),
                session.getProtocol()
        );
    }
}
|
@@ -0,0 +1,4 @@
|
||||
package nu.marginalia.ping.fetcher.response;
|
||||
|
||||
/** Ping outcome: the request timed out before completing. */
public record TimeoutResponse(String errorMessage) implements PingRequestResponse {
}
|
@@ -0,0 +1,4 @@
|
||||
package nu.marginalia.ping.fetcher.response;
|
||||
|
||||
/** Ping outcome: the hostname could not be resolved via DNS. */
public record UnknownHostError() implements PingRequestResponse {
}
|
@@ -0,0 +1,129 @@
|
||||
package nu.marginalia.ping.io;
|
||||
|
||||
import com.google.inject.Provider;
|
||||
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.config.ConnectionConfig;
|
||||
import org.apache.hc.client5.http.config.RequestConfig;
|
||||
import org.apache.hc.client5.http.cookie.StandardCookieSpec;
|
||||
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
|
||||
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager;
|
||||
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder;
|
||||
import org.apache.hc.client5.http.ssl.DefaultClientTlsStrategy;
|
||||
import org.apache.hc.client5.http.ssl.NoopHostnameVerifier;
|
||||
import org.apache.hc.core5.http.HeaderElement;
|
||||
import org.apache.hc.core5.http.HeaderElements;
|
||||
import org.apache.hc.core5.http.HttpResponse;
|
||||
import org.apache.hc.core5.http.io.SocketConfig;
|
||||
import org.apache.hc.core5.http.message.MessageSupport;
|
||||
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||
import org.apache.hc.core5.util.TimeValue;
|
||||
import org.apache.hc.core5.util.Timeout;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.net.ssl.SSLContext;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.util.Iterator;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class HttpClientProvider implements Provider<HttpClient> {
|
||||
private static final HttpClient client;
|
||||
private static PoolingHttpClientConnectionManager connectionManager;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(HttpClientProvider.class);
|
||||
|
||||
static {
|
||||
try {
|
||||
client = createClient();
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private static CloseableHttpClient createClient() throws NoSuchAlgorithmException {
|
||||
final ConnectionConfig connectionConfig = ConnectionConfig.custom()
|
||||
.setSocketTimeout(15, TimeUnit.SECONDS)
|
||||
.setConnectTimeout(15, TimeUnit.SECONDS)
|
||||
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
|
||||
.build();
|
||||
|
||||
connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
|
||||
.setMaxConnPerRoute(2)
|
||||
.setMaxConnTotal(50)
|
||||
.setDefaultConnectionConfig(connectionConfig)
|
||||
.setTlsSocketStrategy(
|
||||
new DefaultClientTlsStrategy(SSLContext.getDefault(), NoopHostnameVerifier.INSTANCE))
|
||||
.build();
|
||||
|
||||
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
|
||||
.setSoLinger(TimeValue.ofSeconds(-1))
|
||||
.setSoTimeout(Timeout.ofSeconds(10))
|
||||
.build()
|
||||
);
|
||||
|
||||
Thread.ofPlatform().daemon(true).start(() -> {
|
||||
try {
|
||||
for (;;) {
|
||||
TimeUnit.SECONDS.sleep(15);
|
||||
logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
|
||||
}
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
});
|
||||
|
||||
final RequestConfig defaultRequestConfig = RequestConfig.custom()
|
||||
.setCookieSpec(StandardCookieSpec.RELAXED)
|
||||
.setResponseTimeout(10, TimeUnit.SECONDS)
|
||||
.setConnectionRequestTimeout(5, TimeUnit.MINUTES)
|
||||
.build();
|
||||
|
||||
return HttpClients.custom()
|
||||
.setConnectionManager(connectionManager)
|
||||
.setRetryStrategy(new RetryStrategy())
|
||||
.setKeepAliveStrategy(new ConnectionKeepAliveStrategy() {
|
||||
// Default keep-alive duration is 3 minutes, but this is too long for us,
|
||||
// as we are either going to re-use it fairly quickly or close it for a long time.
|
||||
//
|
||||
// So we set it to 30 seconds or clamp the server-provided value to a minimum of 10 seconds.
|
||||
private static final TimeValue defaultValue = TimeValue.ofSeconds(30);
|
||||
|
||||
@Override
|
||||
public TimeValue getKeepAliveDuration(HttpResponse response, HttpContext context) {
|
||||
final Iterator<HeaderElement> it = MessageSupport.iterate(response, HeaderElements.KEEP_ALIVE);
|
||||
|
||||
while (it.hasNext()) {
|
||||
final HeaderElement he = it.next();
|
||||
final String param = he.getName();
|
||||
final String value = he.getValue();
|
||||
|
||||
if (value == null)
|
||||
continue;
|
||||
if (!"timeout".equalsIgnoreCase(param))
|
||||
continue;
|
||||
|
||||
try {
|
||||
long timeout = Long.parseLong(value);
|
||||
timeout = Math.clamp(timeout, 30, defaultValue.toSeconds());
|
||||
return TimeValue.ofSeconds(timeout);
|
||||
} catch (final NumberFormatException ignore) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return defaultValue;
|
||||
}
|
||||
})
|
||||
.disableRedirectHandling()
|
||||
.setDefaultRequestConfig(defaultRequestConfig)
|
||||
.build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HttpClient get() {
|
||||
return client;
|
||||
}
|
||||
}
|
||||
|
@@ -0,0 +1,75 @@
|
||||
package nu.marginalia.ping.io;
|
||||
|
||||
import org.apache.hc.client5.http.HttpRequestRetryStrategy;
|
||||
import org.apache.hc.core5.http.HttpRequest;
|
||||
import org.apache.hc.core5.http.HttpResponse;
|
||||
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||
import org.apache.hc.core5.util.TimeValue;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.net.ssl.SSLException;
|
||||
import java.io.IOException;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.net.UnknownHostException;
|
||||
|
||||
public class RetryStrategy implements HttpRequestRetryStrategy {
|
||||
private static final Logger logger = LoggerFactory.getLogger(RetryStrategy.class);
|
||||
|
||||
@Override
|
||||
public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
|
||||
return switch (exception) {
|
||||
case SocketTimeoutException ste -> false;
|
||||
case SSLException ssle -> false;
|
||||
case UnknownHostException uhe -> false;
|
||||
default -> executionCount <= 3;
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean retryRequest(HttpResponse response, int executionCount, HttpContext context) {
|
||||
return switch (response.getCode()) {
|
||||
case 500, 503 -> executionCount <= 2;
|
||||
case 429 -> executionCount <= 3;
|
||||
default -> false;
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public TimeValue getRetryInterval(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
|
||||
return TimeValue.ofSeconds(1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TimeValue getRetryInterval(HttpResponse response, int executionCount, HttpContext context) {
|
||||
|
||||
int statusCode = response.getCode();
|
||||
|
||||
// Give 503 a bit more time
|
||||
if (statusCode == 503) return TimeValue.ofSeconds(5);
|
||||
|
||||
if (statusCode == 429) {
|
||||
// get the Retry-After header
|
||||
var retryAfterHeader = response.getFirstHeader("Retry-After");
|
||||
if (retryAfterHeader == null) {
|
||||
return TimeValue.ofSeconds(3);
|
||||
}
|
||||
|
||||
String retryAfter = retryAfterHeader.getValue();
|
||||
if (retryAfter == null) {
|
||||
return TimeValue.ofSeconds(2);
|
||||
}
|
||||
|
||||
try {
|
||||
int retryAfterTime = Integer.parseInt(retryAfter);
|
||||
retryAfterTime = Math.clamp(retryAfterTime, 1, 5);
|
||||
|
||||
return TimeValue.ofSeconds(retryAfterTime);
|
||||
} catch (NumberFormatException e) {
|
||||
logger.warn("Invalid Retry-After header: {}", retryAfter);
|
||||
}
|
||||
}
|
||||
|
||||
return TimeValue.ofSeconds(2);
|
||||
}
|
||||
}
|
@@ -0,0 +1,29 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
public enum AvailabilityOutageType {
|
||||
NONE,
|
||||
TIMEOUT,
|
||||
SSL_ERROR,
|
||||
DNS_ERROR,
|
||||
CONNECTION_ERROR,
|
||||
HTTP_CLIENT_ERROR,
|
||||
HTTP_SERVER_ERROR,
|
||||
UNKNOWN;
|
||||
|
||||
public static AvailabilityOutageType fromErrorClassification(ErrorClassification errorClassification) {
|
||||
if (null == errorClassification) {
|
||||
return UNKNOWN;
|
||||
}
|
||||
|
||||
return switch (errorClassification) {
|
||||
case NONE -> NONE;
|
||||
case TIMEOUT -> TIMEOUT;
|
||||
case SSL_ERROR -> SSL_ERROR;
|
||||
case DNS_ERROR -> DNS_ERROR;
|
||||
case CONNECTION_ERROR -> CONNECTION_ERROR;
|
||||
case HTTP_CLIENT_ERROR -> HTTP_CLIENT_ERROR;
|
||||
case HTTP_SERVER_ERROR -> HTTP_SERVER_ERROR;
|
||||
case UNKNOWN -> UNKNOWN;
|
||||
};
|
||||
}
|
||||
}
|
@@ -0,0 +1,49 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
import java.sql.Connection;
|
||||
import java.sql.SQLException;
|
||||
import java.time.Instant;
|
||||
|
||||
public record DomainAvailabilityEvent(
|
||||
int domainId,
|
||||
int nodeId,
|
||||
boolean available,
|
||||
AvailabilityOutageType outageType, // e.g., 'TIMEOUT', 'DNS_ERROR', etc.
|
||||
Integer httpStatusCode, // Nullable, as it may not always be applicable
|
||||
String errorMessage, // Specific error details
|
||||
Instant tsUpdate // Timestamp of the last update
|
||||
) implements WritableModel {
|
||||
|
||||
@Override
|
||||
public void write(Connection conn) throws SQLException {
|
||||
try (var ps = conn.prepareStatement("""
|
||||
INSERT INTO DOMAIN_AVAILABILITY_EVENTS (
|
||||
domain_id,
|
||||
node_id,
|
||||
available,
|
||||
outage_type,
|
||||
http_status_code,
|
||||
error_message,
|
||||
ts_change
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||
"""))
|
||||
{
|
||||
ps.setInt(1, domainId());
|
||||
ps.setInt(2, nodeId());
|
||||
ps.setBoolean(3, available());
|
||||
ps.setString(4, outageType().name());
|
||||
if (httpStatusCode() == null) {
|
||||
ps.setNull(5, java.sql.Types.INTEGER);
|
||||
} else {
|
||||
ps.setInt(5, httpStatusCode());
|
||||
}
|
||||
if (errorMessage() == null) {
|
||||
ps.setNull(6, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(6, errorMessage());
|
||||
}
|
||||
ps.setTimestamp(7, java.sql.Timestamp.from(tsUpdate()));
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,361 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.sql.Connection;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
|
||||
public record DomainAvailabilityRecord(
|
||||
int domainId,
|
||||
int nodeId,
|
||||
boolean serverAvailable,
|
||||
@Nullable byte[] serverIp,
|
||||
@Nullable Integer asn,
|
||||
@Nullable Long dataHash,
|
||||
@Nullable Long securityConfigHash,
|
||||
@Nullable HttpSchema httpSchema,
|
||||
@Nullable String httpEtag,
|
||||
@Nullable String httpLastModified,
|
||||
@Nullable Integer httpStatus,
|
||||
@Nullable String httpLocation,
|
||||
@Nullable Duration httpResponseTime,
|
||||
@Nullable ErrorClassification errorClassification,
|
||||
@Nullable String errorMessage,
|
||||
|
||||
@Nullable Instant tsLastPing,
|
||||
@Nullable Instant tsLastAvailable,
|
||||
@Nullable Instant tsLastError,
|
||||
|
||||
Instant nextScheduledUpdate,
|
||||
int backoffConsecutiveFailures,
|
||||
Duration backoffFetchInterval
|
||||
)
|
||||
implements WritableModel
|
||||
{
|
||||
public DomainAvailabilityRecord(ResultSet rs) throws SQLException {
|
||||
this(
|
||||
rs.getInt("DOMAIN_AVAILABILITY_INFORMATION.DOMAIN_ID"),
|
||||
rs.getInt("DOMAIN_AVAILABILITY_INFORMATION.NODE_ID"),
|
||||
rs.getBoolean("DOMAIN_AVAILABILITY_INFORMATION.SERVER_AVAILABLE"),
|
||||
rs.getBytes("DOMAIN_AVAILABILITY_INFORMATION.SERVER_IP"),
|
||||
rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.SERVER_IP_ASN", Integer.class),
|
||||
rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.DATA_HASH", Long.class),
|
||||
rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.SECURITY_CONFIG_HASH", Long.class),
|
||||
httpSchemaFromString(rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.HTTP_SCHEMA", String.class)),
|
||||
rs.getString("DOMAIN_AVAILABILITY_INFORMATION.HTTP_ETAG"),
|
||||
rs.getString("DOMAIN_AVAILABILITY_INFORMATION.HTTP_LAST_MODIFIED"),
|
||||
rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.HTTP_STATUS", Integer.class),
|
||||
rs.getString("DOMAIN_AVAILABILITY_INFORMATION.HTTP_LOCATION"),
|
||||
durationFromMillis(rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.HTTP_RESPONSE_TIME_MS", Integer.class)),
|
||||
errorClassificationFromString(rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.ERROR_CLASSIFICATION", String.class)),
|
||||
rs.getString("DOMAIN_AVAILABILITY_INFORMATION.ERROR_MESSAGE"),
|
||||
rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.TS_LAST_PING", Instant.class),
|
||||
rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.TS_LAST_AVAILABLE", Instant.class),
|
||||
rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.TS_LAST_ERROR", Instant.class),
|
||||
rs.getObject("DOMAIN_AVAILABILITY_INFORMATION.NEXT_SCHEDULED_UPDATE", Instant.class),
|
||||
rs.getInt("DOMAIN_AVAILABILITY_INFORMATION.BACKOFF_CONSECUTIVE_FAILURES"),
|
||||
Duration.ofSeconds(rs.getInt("DOMAIN_AVAILABILITY_INFORMATION.BACKOFF_FETCH_INTERVAL"))
|
||||
);
|
||||
}
|
||||
|
||||
private static HttpSchema httpSchemaFromString(@Nullable String schema) {
|
||||
return schema == null ? null : HttpSchema.valueOf(schema);
|
||||
}
|
||||
private static ErrorClassification errorClassificationFromString(@Nullable String classification) {
|
||||
return classification == null ? null : ErrorClassification.valueOf(classification);
|
||||
}
|
||||
private static Duration durationFromMillis(@Nullable Integer millis) {
|
||||
return millis == null ? null : Duration.ofMillis(millis);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Instant nextUpdateTime() {
|
||||
return nextScheduledUpdate;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(Connection connection) throws SQLException {
|
||||
try (var ps = connection.prepareStatement(
|
||||
"""
|
||||
REPLACE INTO DOMAIN_AVAILABILITY_INFORMATION (
|
||||
domain_id,
|
||||
node_id,
|
||||
server_available,
|
||||
server_ip,
|
||||
data_hash,
|
||||
security_config_hash,
|
||||
http_schema,
|
||||
http_etag,
|
||||
http_last_modified,
|
||||
http_status,
|
||||
http_location,
|
||||
http_response_time_ms,
|
||||
error_classification,
|
||||
error_message,
|
||||
ts_last_ping,
|
||||
ts_last_available,
|
||||
ts_last_error,
|
||||
next_scheduled_update,
|
||||
backoff_consecutive_failures,
|
||||
backoff_fetch_interval,
|
||||
server_ip_asn)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,?)
|
||||
""")) {
|
||||
|
||||
ps.setInt(1, domainId());
|
||||
ps.setInt(2, nodeId());
|
||||
ps.setBoolean(3, serverAvailable());
|
||||
if (serverIp() == null) {
|
||||
ps.setNull(4, java.sql.Types.BINARY);
|
||||
} else {
|
||||
ps.setBytes(4, serverIp());
|
||||
}
|
||||
if (dataHash() == null) {
|
||||
ps.setNull(5, java.sql.Types.BIGINT);
|
||||
} else {
|
||||
ps.setLong(5, dataHash());
|
||||
}
|
||||
if (securityConfigHash() == null) {
|
||||
ps.setNull(6, java.sql.Types.BIGINT);
|
||||
} else {
|
||||
ps.setLong(6, securityConfigHash());
|
||||
}
|
||||
if (httpSchema() == null) {
|
||||
ps.setNull(7, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(7, httpSchema().name());
|
||||
}
|
||||
if (httpEtag() == null) {
|
||||
ps.setNull(8, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(8, httpEtag());
|
||||
}
|
||||
if (httpLastModified() == null) {
|
||||
ps.setNull(9, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(9, httpLastModified());
|
||||
}
|
||||
if (httpStatus() == null) {
|
||||
ps.setNull(10, java.sql.Types.INTEGER);
|
||||
}
|
||||
else {
|
||||
ps.setInt(10, httpStatus());
|
||||
}
|
||||
if (httpLocation() == null) {
|
||||
ps.setNull(11, java.sql.Types.VARCHAR);
|
||||
}
|
||||
else {
|
||||
ps.setString(11, httpLocation());
|
||||
}
|
||||
|
||||
if (httpResponseTime() == null) {
|
||||
ps.setNull(12, java.sql.Types.SMALLINT);
|
||||
}
|
||||
else {
|
||||
ps.setInt(12, Math.clamp(httpResponseTime().toMillis(), 0, 0xFFFF)); // "unsigned short" in SQL
|
||||
}
|
||||
|
||||
if (errorClassification() == null) {
|
||||
ps.setNull(13, java.sql.Types.VARCHAR);
|
||||
}
|
||||
else {
|
||||
ps.setString(13, errorClassification().name());
|
||||
}
|
||||
|
||||
if (errorMessage() == null) {
|
||||
ps.setNull(14, java.sql.Types.VARCHAR);
|
||||
}
|
||||
else {
|
||||
ps.setString(14, errorMessage());
|
||||
}
|
||||
|
||||
ps.setTimestamp(15, java.sql.Timestamp.from(tsLastPing()));
|
||||
|
||||
if (tsLastAvailable() == null) {
|
||||
ps.setNull(16, java.sql.Types.TIMESTAMP);
|
||||
}
|
||||
else {
|
||||
ps.setTimestamp(16, java.sql.Timestamp.from(tsLastAvailable()));
|
||||
}
|
||||
if (tsLastError() == null) {
|
||||
ps.setNull(17, java.sql.Types.TIMESTAMP);
|
||||
}
|
||||
else {
|
||||
ps.setTimestamp(17, java.sql.Timestamp.from(tsLastError()));
|
||||
}
|
||||
|
||||
ps.setTimestamp(18, java.sql.Timestamp.from(nextScheduledUpdate()));
|
||||
ps.setInt(19, backoffConsecutiveFailures());
|
||||
ps.setInt(20, (int) backoffFetchInterval().getSeconds());
|
||||
|
||||
if (asn() == null) {
|
||||
ps.setNull(21, java.sql.Types.INTEGER);
|
||||
} else {
|
||||
ps.setInt(21, asn());
|
||||
}
|
||||
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
public static class Builder {
|
||||
private int domainId;
|
||||
private int nodeId;
|
||||
private boolean serverAvailable;
|
||||
private byte[] serverIp;
|
||||
private Integer serverIpAsn;
|
||||
private Long dataHash;
|
||||
private Long securityConfigHash;
|
||||
private HttpSchema httpSchema;
|
||||
private String httpEtag;
|
||||
private String httpLastModified;
|
||||
private Integer httpStatus;
|
||||
private String httpLocation;
|
||||
private Duration httpResponseTime;
|
||||
private ErrorClassification errorClassification;
|
||||
private String errorMessage;
|
||||
private Instant tsLastPing;
|
||||
private Instant tsLastAvailable;
|
||||
private Instant tsLastError;
|
||||
private Instant nextScheduledUpdate;
|
||||
private int backoffConsecutiveFailures;
|
||||
private Duration backoffFetchInterval;
|
||||
|
||||
public Builder domainId(int domainId) {
|
||||
this.domainId = domainId;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder nodeId(int nodeId) {
|
||||
this.nodeId = nodeId;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder serverAvailable(boolean serverAvailable) {
|
||||
this.serverAvailable = serverAvailable;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder serverIp(byte[] serverIp) {
|
||||
this.serverIp = serverIp;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder serverIpAsn(Integer asn) {
|
||||
this.serverIpAsn = asn;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder dataHash(Long dataHash) {
|
||||
this.dataHash = dataHash;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder securityConfigHash(Long securityConfigHash) {
|
||||
this.securityConfigHash = securityConfigHash;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder httpSchema(HttpSchema httpSchema) {
|
||||
this.httpSchema = httpSchema;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder httpEtag(String httpEtag) {
|
||||
this.httpEtag = httpEtag;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder httpLastModified(String httpLastModified) {
|
||||
this.httpLastModified = httpLastModified;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder httpStatus(Integer httpStatus) {
|
||||
this.httpStatus = httpStatus;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder httpLocation(String httpLocation) {
|
||||
this.httpLocation = httpLocation;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder httpResponseTime(Duration httpResponseTime) {
|
||||
this.httpResponseTime = httpResponseTime;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder errorClassification(ErrorClassification errorClassification) {
|
||||
this.errorClassification = errorClassification;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder errorMessage(String errorMessage) {
|
||||
this.errorMessage = errorMessage;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder tsLastPing(Instant tsLastPing) {
|
||||
this.tsLastPing = tsLastPing;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder tsLastAvailable(Instant tsLastAvailable) {
|
||||
this.tsLastAvailable = tsLastAvailable;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder tsLastError(Instant tsLastError) {
|
||||
this.tsLastError = tsLastError;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder nextScheduledUpdate(Instant nextScheduledUpdate) {
|
||||
this.nextScheduledUpdate = nextScheduledUpdate;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder backoffConsecutiveFailures(int backoffConsecutiveFailures) {
|
||||
this.backoffConsecutiveFailures = backoffConsecutiveFailures;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder backoffFetchInterval(Duration backoffFetchInterval) {
|
||||
this.backoffFetchInterval = backoffFetchInterval;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainAvailabilityRecord build() {
|
||||
return new DomainAvailabilityRecord(
|
||||
domainId,
|
||||
nodeId,
|
||||
serverAvailable,
|
||||
serverIp,
|
||||
serverIpAsn,
|
||||
dataHash,
|
||||
securityConfigHash,
|
||||
httpSchema,
|
||||
httpEtag,
|
||||
httpLastModified,
|
||||
httpStatus,
|
||||
httpLocation,
|
||||
httpResponseTime,
|
||||
errorClassification,
|
||||
errorMessage,
|
||||
tsLastPing,
|
||||
tsLastAvailable,
|
||||
tsLastError,
|
||||
nextScheduledUpdate,
|
||||
backoffConsecutiveFailures,
|
||||
backoffFetchInterval
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
public static Builder builder() {
|
||||
return new Builder();
|
||||
}
|
||||
}
|
@@ -0,0 +1,369 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import nu.marginalia.model.gson.GsonFactory;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.sql.Connection;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.time.Instant;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public record DomainDnsRecord(
|
||||
Integer dnsRootDomainId,
|
||||
String rootDomainName,
|
||||
int nodeAffinity,
|
||||
@Nullable List<String> aRecords,
|
||||
@Nullable List<String> aaaaRecords,
|
||||
@Nullable String cnameRecord,
|
||||
@Nullable List<String> mxRecords,
|
||||
@Nullable List<String> caaRecords,
|
||||
@Nullable List<String> txtRecords,
|
||||
@Nullable List<String> nsRecords,
|
||||
@Nullable String soaRecord,
|
||||
Instant tsLastUpdate,
|
||||
Instant tsNextScheduledUpdate,
|
||||
int dnsCheckPriority)
|
||||
implements WritableModel
|
||||
{
|
||||
private static Gson gson = GsonFactory.get();
|
||||
|
||||
public DomainDnsRecord(ResultSet rs) throws SQLException {
|
||||
this(
|
||||
rs.getObject("DNS_ROOT_DOMAIN_ID", Integer.class),
|
||||
rs.getString("ROOT_DOMAIN_NAME"),
|
||||
rs.getInt("NODE_AFFINITY"),
|
||||
deserializeJsonArray(rs.getString("DNS_A_RECORDS")),
|
||||
deserializeJsonArray(rs.getString("DNS_AAAA_RECORDS")),
|
||||
rs.getString("DNS_CNAME_RECORD"),
|
||||
deserializeJsonArray(rs.getString("DNS_MX_RECORDS")),
|
||||
deserializeJsonArray(rs.getString("DNS_CAA_RECORDS")),
|
||||
deserializeJsonArray(rs.getString("DNS_TXT_RECORDS")),
|
||||
deserializeJsonArray(rs.getString("DNS_NS_RECORDS")),
|
||||
rs.getString("DNS_SOA_RECORD"),
|
||||
rs.getObject("TS_LAST_DNS_CHECK", Instant.class),
|
||||
rs.getObject("TS_NEXT_DNS_CHECK", Instant.class),
|
||||
rs.getInt("DNS_CHECK_PRIORITY")
|
||||
);
|
||||
}
|
||||
|
||||
static List<String> deserializeJsonArray(@Nullable String json) {
|
||||
if (json == null || json.isEmpty()) {
|
||||
return List.of();
|
||||
}
|
||||
return gson.fromJson(json, List.class);
|
||||
}
|
||||
|
||||
public static Builder builder() {
|
||||
return new Builder();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Instant nextUpdateTime() {
|
||||
return tsNextScheduledUpdate;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(Connection connection) throws SQLException {
|
||||
|
||||
if (dnsRootDomainId() != null) {
|
||||
update(connection);
|
||||
return;
|
||||
}
|
||||
|
||||
try (var ps = connection.prepareStatement("""
|
||||
REPLACE INTO DOMAIN_DNS_INFORMATION (
|
||||
ROOT_DOMAIN_NAME,
|
||||
NODE_AFFINITY,
|
||||
DNS_A_RECORDS,
|
||||
DNS_AAAA_RECORDS,
|
||||
DNS_CNAME_RECORD,
|
||||
DNS_MX_RECORDS,
|
||||
DNS_CAA_RECORDS,
|
||||
DNS_TXT_RECORDS,
|
||||
DNS_NS_RECORDS,
|
||||
DNS_SOA_RECORD,
|
||||
TS_LAST_DNS_CHECK,
|
||||
TS_NEXT_DNS_CHECK,
|
||||
DNS_CHECK_PRIORITY
|
||||
) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)
|
||||
""")) {
|
||||
|
||||
ps.setString(1, rootDomainName());
|
||||
ps.setInt(2, nodeAffinity());
|
||||
|
||||
if (aRecords() == null) {
|
||||
ps.setNull(3, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(3, gson.toJson(aRecords()));
|
||||
}
|
||||
if (aaaaRecords() == null) {
|
||||
ps.setNull(4, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(4, gson.toJson(aaaaRecords()));
|
||||
}
|
||||
if (cnameRecord() == null) {
|
||||
ps.setNull(5, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(5, cnameRecord());
|
||||
}
|
||||
if (mxRecords() == null) {
|
||||
ps.setNull(6, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(6, gson.toJson(mxRecords()));
|
||||
}
|
||||
if (caaRecords() == null) {
|
||||
ps.setNull(7, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(7, gson.toJson(caaRecords()));
|
||||
}
|
||||
if (txtRecords() == null) {
|
||||
ps.setNull(8, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(8, gson.toJson(txtRecords()));
|
||||
}
|
||||
if (nsRecords() == null) {
|
||||
ps.setNull(9, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(9, gson.toJson(nsRecords()));
|
||||
}
|
||||
if (soaRecord() == null) {
|
||||
ps.setNull(10, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(10, soaRecord());
|
||||
}
|
||||
ps.setString(10, soaRecord());
|
||||
ps.setTimestamp(11, java.sql.Timestamp.from(tsLastUpdate()));
|
||||
ps.setTimestamp(12, java.sql.Timestamp.from(tsNextScheduledUpdate()));
|
||||
ps.setInt(13, dnsCheckPriority());
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
public void update(Connection connection) throws SQLException {
|
||||
|
||||
try (var ps = connection.prepareStatement("""
|
||||
REPLACE INTO DOMAIN_DNS_INFORMATION (
|
||||
DNS_ROOT_DOMAIN_ID,
|
||||
ROOT_DOMAIN_NAME,
|
||||
NODE_AFFINITY,
|
||||
DNS_A_RECORDS,
|
||||
DNS_AAAA_RECORDS,
|
||||
DNS_CNAME_RECORD,
|
||||
DNS_MX_RECORDS,
|
||||
DNS_CAA_RECORDS,
|
||||
DNS_TXT_RECORDS,
|
||||
DNS_NS_RECORDS,
|
||||
DNS_SOA_RECORD,
|
||||
TS_LAST_DNS_CHECK,
|
||||
TS_NEXT_DNS_CHECK,
|
||||
DNS_CHECK_PRIORITY
|
||||
) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)
|
||||
""")) {
|
||||
|
||||
ps.setObject(1, dnsRootDomainId(), java.sql.Types.INTEGER);
|
||||
ps.setString(2, rootDomainName());
|
||||
ps.setInt(3, nodeAffinity());
|
||||
|
||||
if (aRecords() == null) {
|
||||
ps.setNull(4, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(4, gson.toJson(aRecords()));
|
||||
}
|
||||
if (aaaaRecords() == null) {
|
||||
ps.setNull(5, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(5, gson.toJson(aaaaRecords()));
|
||||
}
|
||||
if (cnameRecord() == null) {
|
||||
ps.setNull(6, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(6, cnameRecord());
|
||||
}
|
||||
if (mxRecords() == null) {
|
||||
ps.setNull(7, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(7, gson.toJson(mxRecords()));
|
||||
}
|
||||
if (caaRecords() == null) {
|
||||
ps.setNull(8, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(8, gson.toJson(caaRecords()));
|
||||
}
|
||||
if (txtRecords() == null) {
|
||||
ps.setNull(9, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(9, gson.toJson(txtRecords()));
|
||||
}
|
||||
if (nsRecords() == null) {
|
||||
ps.setNull(10, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(10, gson.toJson(nsRecords()));
|
||||
}
|
||||
if (soaRecord() == null) {
|
||||
ps.setNull(11, java.sql.Types.VARCHAR);
|
||||
} else {
|
||||
ps.setString(11, soaRecord());
|
||||
}
|
||||
ps.setTimestamp(12, java.sql.Timestamp.from(tsLastUpdate()));
|
||||
ps.setTimestamp(13, java.sql.Timestamp.from(tsNextScheduledUpdate()));
|
||||
ps.setInt(14, dnsCheckPriority());
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
public static class Builder {
|
||||
private Integer dnsRootDomainId;
|
||||
private String rootDomainName;
|
||||
private int nodeAffinity;
|
||||
private List<String> aRecords;
|
||||
private List<String> aaaaRecords;
|
||||
private String cnameRecord;
|
||||
private List<String> mxRecords;
|
||||
private List<String> caaRecords;
|
||||
private List<String> txtRecords;
|
||||
private List<String> nsRecords;
|
||||
private String soaRecord;
|
||||
private Instant tsLastUpdate;
|
||||
private Instant tsNextScheduledUpdate;
|
||||
private int dnsCheckPriority;
|
||||
|
||||
public Builder dnsRootDomainId(Integer dnsRootDomainId) {
|
||||
this.dnsRootDomainId = dnsRootDomainId;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder rootDomainName(String rootDomainName) {
|
||||
this.rootDomainName = rootDomainName;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder nodeAffinity(int nodeAffinity) {
|
||||
this.nodeAffinity = nodeAffinity;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder addARecord(String aRecord) {
|
||||
if (this.aRecords == null) {
|
||||
this.aRecords = new ArrayList<>();
|
||||
}
|
||||
this.aRecords.add(aRecord);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder aRecords(List<String> aRecords) {
|
||||
this.aRecords = aRecords;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder addAaaaRecord(String aaaaRecord) {
|
||||
if (this.aaaaRecords == null) {
|
||||
this.aaaaRecords = new ArrayList<>();
|
||||
}
|
||||
this.aaaaRecords.add(aaaaRecord);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder aaaaRecords(List<String> aaaaRecords) {
|
||||
this.aaaaRecords = aaaaRecords;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder cnameRecord(String cnameRecord) {
|
||||
this.cnameRecord = cnameRecord;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder addMxRecord(String mxRecord) {
|
||||
if (this.mxRecords == null) {
|
||||
this.mxRecords = new ArrayList<>();
|
||||
}
|
||||
this.mxRecords.add(mxRecord);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder mxRecords(List<String> mxRecords) {
|
||||
this.mxRecords = mxRecords;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder addCaaRecord(String caaRecord) {
|
||||
if (this.caaRecords == null) {
|
||||
this.caaRecords = new ArrayList<>();
|
||||
}
|
||||
this.caaRecords.add(caaRecord);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder caaRecords(List<String> caaRecords) {
|
||||
this.caaRecords = caaRecords;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder addTxtRecord(String txtRecord) {
|
||||
if (this.txtRecords == null) {
|
||||
this.txtRecords = new ArrayList<>();
|
||||
}
|
||||
this.txtRecords.add(txtRecord);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder txtRecords(List<String> txtRecords) {
|
||||
this.txtRecords = txtRecords;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder addNsRecord(String nsRecord) {
|
||||
if (this.nsRecords == null) {
|
||||
this.nsRecords = new ArrayList<>();
|
||||
}
|
||||
this.nsRecords.add(nsRecord);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder nsRecords(List<String> nsRecords) {
|
||||
this.nsRecords = nsRecords;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder soaRecord(String soaRecord) {
|
||||
this.soaRecord = soaRecord;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder tsLastUpdate(Instant tsLastUpdate) {
|
||||
this.tsLastUpdate = tsLastUpdate;
|
||||
return this;
|
||||
}
|
||||
public Builder tsNextScheduledUpdate(Instant nextScheduledUpdate) {
|
||||
this.tsNextScheduledUpdate = nextScheduledUpdate;
|
||||
return this;
|
||||
}
|
||||
public Builder dnsCheckPriority(int dnsCheckPriority) {
|
||||
this.dnsCheckPriority = dnsCheckPriority;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainDnsRecord build() {
|
||||
return new DomainDnsRecord(
|
||||
dnsRootDomainId,
|
||||
rootDomainName,
|
||||
nodeAffinity,
|
||||
aRecords,
|
||||
aaaaRecords,
|
||||
cnameRecord,
|
||||
mxRecords,
|
||||
caaRecords,
|
||||
txtRecords,
|
||||
nsRecords,
|
||||
soaRecord,
|
||||
tsLastUpdate,
|
||||
tsNextScheduledUpdate,
|
||||
dnsCheckPriority
|
||||
);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
@@ -0,0 +1,10 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
|
||||
public record DomainReference(int domainId, int nodeId, String domainName) {
|
||||
public EdgeDomain asEdgeDomain() {
|
||||
return new EdgeDomain(domainName);
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,91 @@
|
||||
package nu.marginalia.ping.model;
|
||||
|
||||
import nu.marginalia.ping.util.JsonObject;
|
||||
|
||||
import java.sql.Connection;
|
||||
import java.sql.SQLException;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
|
||||
public record DomainSecurityEvent(
|
||||
int domainId,
|
||||
int nodeId,
|
||||
Instant tsChange,
|
||||
boolean asnChanged,
|
||||
boolean certificateFingerprintChanged,
|
||||
boolean certificateProfileChanged,
|
||||
boolean certificateSanChanged,
|
||||
boolean certificatePublicKeyChanged,
|
||||
boolean certificateSerialNumberChanged,
|
||||
boolean certificateIssuerChanged,
|
||||
SchemaChange schemaChange,
|
||||
Duration oldCertificateTimeToExpiry,
|
||||
boolean securityHeadersChanged,
|
||||
boolean ipChanged,
|
||||
boolean softwareChanged,
|
||||
JsonObject<DomainSecurityRecord> securitySignatureBefore,
|
||||
JsonObject<DomainSecurityRecord> securitySignatureAfter
|
||||
) implements WritableModel {
|
||||
|
||||
@Override
|
||||
public void write(Connection connection) throws SQLException {
|
||||
try (var ps = connection.prepareStatement("""
|
||||
INSERT INTO DOMAIN_SECURITY_EVENTS (
|
||||
domain_id,
|
||||
node_id,
|
||||
ts_change,
|
||||
change_asn,
|
||||
change_certificate_fingerprint,
|
||||
change_certificate_profile,
|
||||
change_certificate_san,
|
||||
change_certificate_public_key,
|
||||
change_security_headers,
|
||||
change_ip_address,
|
||||
change_software,
|
||||
old_cert_time_to_expiry,
|
||||
security_signature_before,
|
||||
security_signature_after,
|
||||
change_certificate_serial_number,
|
||||
change_certificate_issuer,
|
||||
change_schema
|
||||
) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
|
||||
"""))
|
||||
{
|
||||
|
||||
ps.setInt(1, domainId());
|
||||
ps.setInt(2, nodeId());
|
||||
ps.setTimestamp(3, java.sql.Timestamp.from(tsChange()));
|
||||
ps.setBoolean(4, asnChanged());
|
||||
ps.setBoolean(5, certificateFingerprintChanged());
|
||||
ps.setBoolean(6, certificateProfileChanged());
|
||||
ps.setBoolean(7, certificateSanChanged());
|
||||
ps.setBoolean(8, certificatePublicKeyChanged());
|
||||
ps.setBoolean(9, securityHeadersChanged());
|
||||
ps.setBoolean(10, ipChanged());
|
||||
ps.setBoolean(11, softwareChanged());
|
||||
|
||||
if (oldCertificateTimeToExpiry() == null) {
|
||||
ps.setNull(12, java.sql.Types.BIGINT);
|
||||
} else {
|
||||
ps.setLong(12, oldCertificateTimeToExpiry().toHours());
|
||||
}
|
||||
|
||||
if (securitySignatureBefore() == null) {
|
||||
ps.setNull(13, java.sql.Types.BLOB);
|
||||
} else {
|
||||
ps.setBytes(13, securitySignatureBefore().compressed());
|
||||
}
|
||||
if (securitySignatureAfter() == null) {
|
||||
ps.setNull(14, java.sql.Types.BLOB);
|
||||
} else {
|
||||
ps.setBytes(14, securitySignatureAfter().compressed());
|
||||
}
|
||||
|
||||
ps.setBoolean(15, certificateSerialNumberChanged());
|
||||
ps.setBoolean(16, certificateIssuerChanged());
|
||||
ps.setString(17, schemaChange.name());
|
||||
|
||||
ps.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,543 @@
|
||||
package nu.marginalia.ping.model;

import org.apache.commons.lang3.StringUtils;

import javax.annotation.Nullable;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Timestamp;
import java.sql.Types;
import java.time.Instant;
import java.util.Objects;

/**
 * Snapshot of the observed HTTP/TLS security posture of a domain, as stored
 * in the DOMAIN_SECURITY_INFORMATION table (one row per domain+node, upserted
 * via REPLACE). Most fields are nullable because probes may fail partway or
 * the server may simply not send the corresponding header.
 */
public record DomainSecurityRecord(
        int domainId,
        int nodeId,
        @Nullable Integer asn,
        @Nullable HttpSchema httpSchema,
        @Nullable String httpVersion,
        @Nullable String httpCompression,
        @Nullable String httpCacheControl,
        @Nullable Instant sslCertNotBefore,
        @Nullable Instant sslCertNotAfter,
        @Nullable String sslCertIssuer,
        @Nullable String sslCertSubject,
        @Nullable byte[] sslCertPublicKeyHash,
        @Nullable String sslCertSerialNumber,
        @Nullable byte[] sslCertFingerprintSha256,
        @Nullable String sslCertSan,
        boolean sslCertWildcard,
        @Nullable String sslProtocol,
        @Nullable String sslCipherSuite,
        @Nullable String sslKeyExchange,
        @Nullable Integer sslCertificateChainLength,
        boolean sslCertificateValid,
        @Nullable String headerCorsAllowOrigin,
        boolean headerCorsAllowCredentials,
        @Nullable Integer headerContentSecurityPolicyHash,
        @Nullable String headerStrictTransportSecurity,
        @Nullable String headerReferrerPolicy,
        @Nullable String headerXFrameOptions,
        @Nullable String headerXContentTypeOptions,
        @Nullable String headerXXssProtection,
        @Nullable String headerServer,
        @Nullable String headerXPoweredBy,
        @Nullable Instant tsLastUpdate
)
implements WritableModel
{

    /** Hash over the certificate-identity fields, used to detect profile changes cheaply. */
    public int certificateProfileHash() {
        return Objects.hash(
                sslCertIssuer,
                sslCertSubject,
                sslCipherSuite,
                sslKeyExchange
        );
    }

    /** Hash over the security-relevant response headers, used to detect header changes cheaply. */
    public int securityHeadersHash() {
        return Objects.hash(
                headerCorsAllowOrigin,
                headerCorsAllowCredentials,
                headerContentSecurityPolicyHash,
                headerStrictTransportSecurity,
                headerReferrerPolicy,
                headerXFrameOptions,
                headerXContentTypeOptions,
                headerXXssProtection
        );
    }

    /**
     * Materializes a record from a joined result set; column references are
     * table-qualified so this also works when the query joins other tables.
     *
     * @throws SQLException if a referenced column is absent or unreadable
     */
    public DomainSecurityRecord(ResultSet rs) throws SQLException {
        this(rs.getInt("DOMAIN_SECURITY_INFORMATION.DOMAIN_ID"),
                rs.getInt("DOMAIN_SECURITY_INFORMATION.NODE_ID"),
                rs.getObject("DOMAIN_SECURITY_INFORMATION.ASN", Integer.class),
                httpSchemaFromString(rs.getString("DOMAIN_SECURITY_INFORMATION.HTTP_SCHEMA")),
                rs.getString("DOMAIN_SECURITY_INFORMATION.HTTP_VERSION"),
                rs.getString("DOMAIN_SECURITY_INFORMATION.HTTP_COMPRESSION"),
                rs.getString("DOMAIN_SECURITY_INFORMATION.HTTP_CACHE_CONTROL"),
                rs.getObject("DOMAIN_SECURITY_INFORMATION.SSL_CERT_NOT_BEFORE", Instant.class),
                rs.getObject("DOMAIN_SECURITY_INFORMATION.SSL_CERT_NOT_AFTER", Instant.class),
                rs.getString("DOMAIN_SECURITY_INFORMATION.SSL_CERT_ISSUER"),
                rs.getString("DOMAIN_SECURITY_INFORMATION.SSL_CERT_SUBJECT"),
                rs.getBytes("DOMAIN_SECURITY_INFORMATION.SSL_CERT_PUBLIC_KEY_HASH"),
                rs.getString("DOMAIN_SECURITY_INFORMATION.SSL_CERT_SERIAL_NUMBER"),
                rs.getBytes("DOMAIN_SECURITY_INFORMATION.SSL_CERT_FINGERPRINT_SHA256"),
                rs.getString("DOMAIN_SECURITY_INFORMATION.SSL_CERT_SAN"),
                rs.getBoolean("DOMAIN_SECURITY_INFORMATION.SSL_CERT_WILDCARD"),
                rs.getString("DOMAIN_SECURITY_INFORMATION.SSL_PROTOCOL"),
                rs.getString("DOMAIN_SECURITY_INFORMATION.SSL_CIPHER_SUITE"),
                rs.getString("DOMAIN_SECURITY_INFORMATION.SSL_KEY_EXCHANGE"),
                rs.getObject("DOMAIN_SECURITY_INFORMATION.SSL_CERTIFICATE_CHAIN_LENGTH", Integer.class),
                rs.getBoolean("DOMAIN_SECURITY_INFORMATION.SSL_CERTIFICATE_VALID"),
                rs.getString("DOMAIN_SECURITY_INFORMATION.HEADER_CORS_ALLOW_ORIGIN"),
                rs.getBoolean("DOMAIN_SECURITY_INFORMATION.HEADER_CORS_ALLOW_CREDENTIALS"),
                // BUGFIX: was rs.getInt(...), which maps SQL NULL to 0 even though
                // this component is @Nullable Integer; getObject preserves NULL.
                rs.getObject("DOMAIN_SECURITY_INFORMATION.HEADER_CONTENT_SECURITY_POLICY_HASH", Integer.class),
                rs.getString("DOMAIN_SECURITY_INFORMATION.HEADER_STRICT_TRANSPORT_SECURITY"),
                rs.getString("DOMAIN_SECURITY_INFORMATION.HEADER_REFERRER_POLICY"),
                rs.getString("DOMAIN_SECURITY_INFORMATION.HEADER_X_FRAME_OPTIONS"),
                rs.getString("DOMAIN_SECURITY_INFORMATION.HEADER_X_CONTENT_TYPE_OPTIONS"),
                rs.getString("DOMAIN_SECURITY_INFORMATION.HEADER_X_XSS_PROTECTION"),
                rs.getString("DOMAIN_SECURITY_INFORMATION.HEADER_SERVER"),
                rs.getString("DOMAIN_SECURITY_INFORMATION.HEADER_X_POWERED_BY"),
                rs.getObject("DOMAIN_SECURITY_INFORMATION.TS_LAST_UPDATE", Instant.class));
    }

    /** Parses the stored schema name; null stays null. */
    private static HttpSchema httpSchemaFromString(@Nullable String schema) {
        return schema == null ? null : HttpSchema.valueOf(schema);
    }

    /** Parses the stored revocation status name; null stays null. */
    private static SslCertRevocationStatus sslCertRevocationStatusFromString(@Nullable String status) {
        return status == null ? null : SslCertRevocationStatus.valueOf(status);
    }

    // --- null-safe PreparedStatement setter helpers -------------------------
    // JDBC setters do not accept null for primitives, so each nullable column
    // needs an explicit setNull with the right SQL type.

    private static void setNullableString(PreparedStatement ps, int idx, @Nullable String value) throws SQLException {
        if (value == null) ps.setNull(idx, Types.VARCHAR);
        else ps.setString(idx, value);
    }

    private static void setNullableBytes(PreparedStatement ps, int idx, @Nullable byte[] value) throws SQLException {
        if (value == null) ps.setNull(idx, Types.BINARY);
        else ps.setBytes(idx, value);
    }

    private static void setNullableInt(PreparedStatement ps, int idx, @Nullable Integer value) throws SQLException {
        if (value == null) ps.setNull(idx, Types.INTEGER);
        else ps.setInt(idx, value);
    }

    private static void setNullableInstant(PreparedStatement ps, int idx, @Nullable Instant value) throws SQLException {
        if (value == null) ps.setNull(idx, Types.TIMESTAMP);
        else ps.setTimestamp(idx, Timestamp.from(value));
    }

    /**
     * Upserts this record into DOMAIN_SECURITY_INFORMATION via REPLACE.
     * Parameter ordinals 1..32 follow the column list in the statement.
     *
     * @param connection open JDBC connection; left open after the call
     * @throws SQLException on any database error
     */
    @Override
    public void write(Connection connection) throws SQLException {
        try (var ps = connection.prepareStatement(
                """
                REPLACE INTO DOMAIN_SECURITY_INFORMATION (
                    domain_id,
                    node_id,
                    http_schema,
                    http_version,
                    http_compression,
                    http_cache_control,
                    ssl_cert_not_before,
                    ssl_cert_not_after,
                    ssl_cert_issuer,
                    ssl_cert_subject,
                    ssl_cert_serial_number,
                    ssl_cert_fingerprint_sha256,
                    ssl_cert_san,
                    ssl_cert_wildcard,
                    ssl_protocol,
                    ssl_cipher_suite,
                    ssl_key_exchange,
                    ssl_certificate_chain_length,
                    ssl_certificate_valid,
                    header_cors_allow_origin,
                    header_cors_allow_credentials,
                    header_content_security_policy_hash,
                    header_strict_transport_security,
                    header_referrer_policy,
                    header_x_frame_options,
                    header_x_content_type_options,
                    header_x_xss_protection,
                    header_server,
                    header_x_powered_by,
                    ssl_cert_public_key_hash,
                    asn,
                    ts_last_update)
                VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
                """))
        {
            ps.setInt(1, domainId());
            ps.setInt(2, nodeId());
            // Enum stored by name.
            setNullableString(ps, 3, httpSchema() == null ? null : httpSchema().name());
            setNullableString(ps, 4, httpVersion());
            setNullableString(ps, 5, httpCompression());
            setNullableString(ps, 6, httpCacheControl());
            setNullableInstant(ps, 7, sslCertNotBefore());
            setNullableInstant(ps, 8, sslCertNotAfter());
            setNullableString(ps, 9, sslCertIssuer());
            setNullableString(ps, 10, sslCertSubject());
            setNullableString(ps, 11, sslCertSerialNumber());
            setNullableBytes(ps, 12, sslCertFingerprintSha256());
            setNullableString(ps, 13, sslCertSan());
            ps.setBoolean(14, sslCertWildcard());
            setNullableString(ps, 15, sslProtocol());
            setNullableString(ps, 16, sslCipherSuite());
            setNullableString(ps, 17, sslKeyExchange());
            setNullableInt(ps, 18, sslCertificateChainLength());
            ps.setBoolean(19, sslCertificateValid());
            setNullableString(ps, 20, headerCorsAllowOrigin());
            ps.setBoolean(21, headerCorsAllowCredentials());
            setNullableInt(ps, 22, headerContentSecurityPolicyHash());
            setNullableString(ps, 23, headerStrictTransportSecurity());
            setNullableString(ps, 24, headerReferrerPolicy());
            setNullableString(ps, 25, headerXFrameOptions());
            setNullableString(ps, 26, headerXContentTypeOptions());
            setNullableString(ps, 27, headerXXssProtection());
            setNullableString(ps, 28, headerServer());
            setNullableString(ps, 29, headerXPoweredBy());
            setNullableBytes(ps, 30, sslCertPublicKeyHash());
            setNullableInt(ps, 31, asn());
            setNullableInstant(ps, 32, tsLastUpdate());

            ps.executeUpdate();
        }
    }

    /**
     * Fluent builder. String setters truncate to the backing column width
     * (via StringUtils.truncate) so oversized header values cannot fail the
     * REPLACE; boolean fields default to false.
     */
    public static class Builder {
        private int domainId;
        private int nodeId;
        private Integer asn;
        private HttpSchema httpSchema;
        private String httpVersion;
        private String httpCompression;
        private String httpCacheControl;
        private Instant sslCertNotBefore;
        private Instant sslCertNotAfter;
        private String sslCertIssuer;
        private String sslCertSubject;
        private String sslCertSerialNumber;
        private byte[] sslCertPublicKeyHash;
        private byte[] sslCertFingerprintSha256;
        private String sslCertSan;
        private boolean sslCertWildcard;
        private String sslProtocol;
        private String sslCipherSuite;
        private String sslKeyExchange;
        private Integer sslCertificateChainLength;
        private boolean sslCertificateValid;
        private String headerCorsAllowOrigin;
        private boolean headerCorsAllowCredentials;
        private Integer headerContentSecurityPolicyHash;
        private String headerStrictTransportSecurity;
        private String headerReferrerPolicy;
        private String headerXFrameOptions;
        private String headerXContentTypeOptions;
        private String headerXXssProtection;
        private String headerServer;
        private String headerXPoweredBy;
        private Instant tsLastUpdate;

        public Builder() {
            // Default values for boolean fields
            this.sslCertWildcard = false;
            this.sslCertificateValid = false;
            this.headerCorsAllowCredentials = false;
        }

        public Builder domainId(int domainId) {
            this.domainId = domainId;
            return this;
        }

        public Builder nodeId(int nodeId) {
            this.nodeId = nodeId;
            return this;
        }

        public Builder asn(@Nullable Integer asn) {
            this.asn = asn;
            return this;
        }

        public Builder httpSchema(HttpSchema httpSchema) {
            this.httpSchema = httpSchema;
            return this;
        }

        public Builder httpVersion(String httpVersion) {
            this.httpVersion = StringUtils.truncate(httpVersion, 10);
            return this;
        }

        public Builder httpCompression(String httpCompression) {
            this.httpCompression = StringUtils.truncate(httpCompression, 50);
            return this;
        }

        public Builder httpCacheControl(String httpCacheControl) {
            this.httpCacheControl = httpCacheControl;
            return this;
        }

        public Builder sslCertNotBefore(Instant sslCertNotBefore) {
            this.sslCertNotBefore = sslCertNotBefore;
            return this;
        }

        public Builder sslCertNotAfter(Instant sslCertNotAfter) {
            this.sslCertNotAfter = sslCertNotAfter;
            return this;
        }

        public Builder sslCertIssuer(String sslCertIssuer) {
            this.sslCertIssuer = StringUtils.truncate(sslCertIssuer, 255);
            return this;
        }

        public Builder sslCertSubject(String sslCertSubject) {
            this.sslCertSubject = StringUtils.truncate(sslCertSubject, 255);
            return this;
        }

        public Builder sslCertSerialNumber(String sslCertSerialNumber) {
            this.sslCertSerialNumber = sslCertSerialNumber;
            return this;
        }

        public Builder sslCertPublicKeyHash(byte[] sslCertPublicKeyHash) {
            this.sslCertPublicKeyHash = sslCertPublicKeyHash;
            return this;
        }

        public Builder sslCertFingerprintSha256(byte[] sslCertFingerprintSha256) {
            this.sslCertFingerprintSha256 = sslCertFingerprintSha256;
            return this;
        }

        public Builder sslCertSan(String sslCertSan) {
            this.sslCertSan = sslCertSan;
            return this;
        }

        public Builder sslCertWildcard(boolean sslCertWildcard) {
            this.sslCertWildcard = sslCertWildcard;
            return this;
        }

        public Builder sslProtocol(String sslProtocol) {
            this.sslProtocol = sslProtocol;
            return this;
        }

        public Builder sslCipherSuite(String sslCipherSuite) {
            this.sslCipherSuite = sslCipherSuite;
            return this;
        }

        public Builder sslKeyExchange(String sslKeyExchange) {
            this.sslKeyExchange = sslKeyExchange;
            return this;
        }

        public Builder sslCertificateChainLength(Integer sslCertificateChainLength) {
            this.sslCertificateChainLength = sslCertificateChainLength;
            return this;
        }

        public Builder sslCertificateValid(boolean sslCertificateValid) {
            this.sslCertificateValid = sslCertificateValid;
            return this;
        }

        public Builder headerCorsAllowOrigin(String headerCorsAllowOrigin) {
            this.headerCorsAllowOrigin = headerCorsAllowOrigin;
            return this;
        }

        public Builder headerCorsAllowCredentials(boolean headerCorsAllowCredentials) {
            this.headerCorsAllowCredentials = headerCorsAllowCredentials;
            return this;
        }

        public Builder headerContentSecurityPolicyHash(Integer headerContentSecurityPolicyHash) {
            this.headerContentSecurityPolicyHash = headerContentSecurityPolicyHash;
            return this;
        }

        public Builder headerStrictTransportSecurity(String headerStrictTransportSecurity) {
            this.headerStrictTransportSecurity = StringUtils.truncate(headerStrictTransportSecurity, 255);
            return this;
        }

        public Builder headerReferrerPolicy(String headerReferrerPolicy) {
            this.headerReferrerPolicy = StringUtils.truncate(headerReferrerPolicy, 50);
            return this;
        }

        public Builder headerXFrameOptions(String headerXFrameOptions) {
            this.headerXFrameOptions = StringUtils.truncate(headerXFrameOptions, 50);
            return this;
        }

        public Builder headerXContentTypeOptions(String headerXContentTypeOptions) {
            this.headerXContentTypeOptions = StringUtils.truncate(headerXContentTypeOptions, 50);
            return this;
        }

        public Builder headerXXssProtection(String headerXXssProtection) {
            this.headerXXssProtection = StringUtils.truncate(headerXXssProtection, 50);
            return this;
        }

        public Builder headerServer(String headerServer) {
            this.headerServer = StringUtils.truncate(headerServer, 255);
            return this;
        }

        public Builder headerXPoweredBy(String headerXPoweredBy) {
            this.headerXPoweredBy = StringUtils.truncate(headerXPoweredBy, 255);
            return this;
        }

        public Builder tsLastUpdate(Instant tsLastUpdate) {
            this.tsLastUpdate = tsLastUpdate;
            return this;
        }

        public DomainSecurityRecord build() {
            return new DomainSecurityRecord(
                    domainId,
                    nodeId,
                    asn,
                    httpSchema,
                    httpVersion,
                    httpCompression,
                    httpCacheControl,
                    sslCertNotBefore,
                    sslCertNotAfter,
                    sslCertIssuer,
                    sslCertSubject,
                    sslCertPublicKeyHash,
                    sslCertSerialNumber,
                    sslCertFingerprintSha256,
                    sslCertSan,
                    sslCertWildcard,
                    sslProtocol,
                    sslCipherSuite,
                    sslKeyExchange,
                    sslCertificateChainLength,
                    sslCertificateValid,
                    headerCorsAllowOrigin,
                    headerCorsAllowCredentials,
                    headerContentSecurityPolicyHash,
                    headerStrictTransportSecurity,
                    headerReferrerPolicy,
                    headerXFrameOptions,
                    headerXContentTypeOptions,
                    headerXXssProtection,
                    headerServer,
                    headerXPoweredBy,
                    tsLastUpdate
            );
        }
    }

    public static Builder builder() {
        return new Builder();
    }
}
|
@@ -0,0 +1,12 @@
|
||||
package nu.marginalia.ping.model;

/**
 * Coarse classification of the failure mode observed when probing a domain.
 * Stored with availability records so failures can be aggregated by cause.
 */
public enum ErrorClassification {
    NONE,              // probe succeeded, no error
    TIMEOUT,           // request timed out
    SSL_ERROR,         // TLS handshake or certificate failure
    DNS_ERROR,         // name resolution failure
    CONNECTION_ERROR,  // TCP-level connection failure
    HTTP_CLIENT_ERROR, // HTTP 4xx response
    HTTP_SERVER_ERROR, // HTTP 5xx response
    UNKNOWN            // anything not matching the categories above
}
|
@@ -0,0 +1,13 @@
|
||||
package nu.marginalia.ping.model;

/**
 * Result of a historical-availability lookup for one domain, carrying
 * whatever subset of records was found. Sealed over the nested variants
 * (same compilation unit, so no explicit permits clause is needed):
 * domain reference only, availability only, or availability plus security.
 */
public sealed interface HistoricalAvailabilityData {
    /** The domain name this datum describes. */
    public String domain();

    /** Variant: only the domain reference is known; no probe data exists. */
    record JustDomainReference(DomainReference domainReference) implements HistoricalAvailabilityData {
        @Override
        public String domain() {
            return domainReference.domainName();
        }
    }

    /** Variant: availability data exists, but no security record. */
    record JustAvailability(String domain, DomainAvailabilityRecord record) implements HistoricalAvailabilityData {}

    /** Variant: both availability and security records exist. */
    record AvailabilityAndSecurity(String domain, DomainAvailabilityRecord availabilityRecord, DomainSecurityRecord securityRecord) implements HistoricalAvailabilityData {}
}
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user