Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
Synced 2025-10-06 17:32:39 +02:00

Compare commits: deploy-018 ... deploy-026 (129 commits)
SHA1
6f1659ecb2
982dcb28f0
fc686d8b2e
69ef0f334a
446746f3bd
24ab8398bb
d2ceeff4cf
cf64214b1c
e50d09cc01
bce3892ce0
36581b25c2
52ff7fb4dd
a4e49e658a
e2c56dc3ca
470b866008
4895a2ac7a
fd32ae9fa7
470651ea4c
8d4829e783
1290bc15dc
e7fa558954
720685bf3f
cbec63c7da
b03ca75785
184aedc071
0275bad281
fd83a9d0b8
d556f8ae3a
e37559837b
3564c4aaee
92c54563ab
d7a5d90b07
0a0e88fd6e
b4fc0c4368
87ee8765b8
1adf4835fa
b7b5d0bf46
416059adde
db7930016a
82456ad673
0882a6d9cd
5020029c2d
ac44d0b093
4b32b9b10e
9f041d6631
13fb1efce4
c1225165b7
67ad7a3bbc
ed62ec8a35
42b24cfa34
1ffaab2da6
5f93c7f767
4001c68c82
6b811489c5
e9d317c65d
16b05a4737
021cd73cbb
4253bd53b5
14c87461a5
9afed0a18e
afad4deb94
f071c947e4
79996c9348
db907ab06a
c49cd9dd95
eec9df3b0a
e5f3288de6
d587544d3a
1a9ae1bc40
e0c81e956a
542fb12b38
65ec734566
10b6a25c63
6260f6bec7
d6d5467696
034560ca75
e994fddae4
345f01f306
5a8e286689
39a055aa94
37aaa90dc9
24022c5adc
1de9ecc0b6
9b80245ea0
4e1595c1a6
0be8585fa5
a0fe070fe7
abe9da0fc6
56d0128b0a
840b68ac55
c34ff6d6c3
32780967d8
7330bc489d
ea23f33738
4a8a028118
a25bc647be
a720dba3a2
284f382867
a80717f138
d6da715fa4
c1ec7aa491
3daf37e283
44a774d3a8
597aeaf496
06df7892c2
dc26854268
9f16326cba
ed66d0b3a7
c3afc82dad
08e25e539e
4946044dd0
edf382e1c5
644cba32e4
34b76390b2
43cd507971
cc40e99fdc
8a944cf4c6
1c128e6d82
be039d1a8c
4edc0d3267
890f521d0d
b1814a30f7
f59a9eb025
599534806b
7e8253dac7
97a6780ea3
eb634beec8
269ebd1654
39ce40bfeb

ROADMAP.md (16 changes)

@@ -38,14 +38,6 @@ associated with each language added, at least a models file or two, as well as s

It would be very helpful to find a speaker of a large language other than English to help in the fine tuning.

- ## Support for binary formats like PDF
-
- The crawler needs to be modified to retain them, and the conversion logic needs to parse them.
- The documents database probably should have some sort of flag indicating it's a PDF as well.
-
- PDF parsing is known to be a bit of a security liability so some thought needs to be put in
- that direction as well.
-
## Custom ranking logic

Stract does an interesting thing where they have configurable search filters.

@@ -66,6 +58,14 @@ One of the search engine's biggest limitations right now is that it does not ind

# Completed

+ ## Support for binary formats like PDF (COMPLETED 2025-05)
+
+ The crawler needs to be modified to retain them, and the conversion logic needs to parse them.
+ The documents database probably should have some sort of flag indicating it's a PDF as well.
+
+ PDF parsing is known to be a bit of a security liability so some thought needs to be put in
+ that direction as well.
+
## Web Design Overhaul (COMPLETED 2025-01)

The design is kinda clunky and hard to maintain, and needlessly outdated-looking.

@@ -1,3 +1,8 @@
package nu.marginalia;

+ /**
+  * A record representing a User Agent.
+  * @param uaString - the header value of the User Agent
+  * @param uaIdentifier - what we look for in robots.txt
+  */
public record UserAgent(String uaString, String uaIdentifier) {}

@@ -45,7 +45,7 @@ public class NodeConfigurationService {
    public List<NodeConfiguration> getAll() {
        try (var conn = dataSource.getConnection();
             var qs = conn.prepareStatement("""
-                    SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, NODE_PROFILE, DISABLED
+                    SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, AUTO_ASSIGN_DOMAINS, KEEP_WARCS, NODE_PROFILE, DISABLED
                     FROM NODE_CONFIGURATION
                     """)) {
            var rs = qs.executeQuery();

@@ -59,6 +59,7 @@ public class NodeConfigurationService {
                        rs.getBoolean("ACCEPT_QUERIES"),
                        rs.getBoolean("AUTO_CLEAN"),
                        rs.getBoolean("PRECESSION"),
+                       rs.getBoolean("AUTO_ASSIGN_DOMAINS"),
                        rs.getBoolean("KEEP_WARCS"),
                        NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
                        rs.getBoolean("DISABLED")

@@ -75,7 +76,7 @@ public class NodeConfigurationService {
    public NodeConfiguration get(int nodeId) throws SQLException {
        try (var conn = dataSource.getConnection();
             var qs = conn.prepareStatement("""
-                    SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, NODE_PROFILE, DISABLED
+                    SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, AUTO_ASSIGN_DOMAINS, KEEP_WARCS, NODE_PROFILE, DISABLED
                     FROM NODE_CONFIGURATION
                     WHERE ID=?
                     """)) {

@@ -88,6 +89,7 @@ public class NodeConfigurationService {
                    rs.getBoolean("ACCEPT_QUERIES"),
                    rs.getBoolean("AUTO_CLEAN"),
                    rs.getBoolean("PRECESSION"),
+                   rs.getBoolean("AUTO_ASSIGN_DOMAINS"),
                    rs.getBoolean("KEEP_WARCS"),
                    NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
                    rs.getBoolean("DISABLED")

@@ -102,7 +104,7 @@ public class NodeConfigurationService {
        try (var conn = dataSource.getConnection();
             var us = conn.prepareStatement("""
                     UPDATE NODE_CONFIGURATION
-                    SET DESCRIPTION=?, ACCEPT_QUERIES=?, AUTO_CLEAN=?, PRECESSION=?, KEEP_WARCS=?, DISABLED=?, NODE_PROFILE=?
+                    SET DESCRIPTION=?, ACCEPT_QUERIES=?, AUTO_CLEAN=?, PRECESSION=?, AUTO_ASSIGN_DOMAINS=?, KEEP_WARCS=?, DISABLED=?, NODE_PROFILE=?
                     WHERE ID=?
                     """))
        {

@@ -110,10 +112,11 @@ public class NodeConfigurationService {
            us.setBoolean(2, config.acceptQueries());
            us.setBoolean(3, config.autoClean());
            us.setBoolean(4, config.includeInPrecession());
-           us.setBoolean(5, config.keepWarcs());
-           us.setBoolean(6, config.disabled());
-           us.setString(7, config.profile().name());
-           us.setInt(8, config.node());
+           us.setBoolean(5, config.autoAssignDomains());
+           us.setBoolean(6, config.keepWarcs());
+           us.setBoolean(7, config.disabled());
+           us.setString(8, config.profile().name());
+           us.setInt(9, config.node());

            if (us.executeUpdate() <= 0)
                throw new IllegalStateException("Failed to update configuration");

@@ -5,6 +5,7 @@ public record NodeConfiguration(int node,
                                boolean acceptQueries,
                                boolean autoClean,
                                boolean includeInPrecession,
+                               boolean autoAssignDomains,
                                boolean keepWarcs,
                                NodeProfile profile,
                                boolean disabled

@@ -20,9 +20,7 @@ public enum NodeProfile {
    }

    public boolean permitBatchCrawl() {
-       return isBatchCrawl() ||isMixed();
-   }
-   public boolean permitSideload() {
-       return isMixed() || isSideload();
+       return isBatchCrawl() || isMixed();
    }
+   public boolean permitSideload() { return isSideload() || isMixed(); }
}

@@ -2,6 +2,7 @@ package nu.marginalia.nodecfg;

import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.nodecfg.model.NodeConfiguration;
import nu.marginalia.nodecfg.model.NodeProfile;
import nu.marginalia.test.TestMigrationLoader;
import org.junit.jupiter.api.BeforeAll;

@@ -62,6 +63,63 @@ public class NodeConfigurationServiceTest {
        assertEquals(2, list.size());
        assertEquals(a, list.get(0));
        assertEquals(b, list.get(1));
    }

+   // Test all the fields that are only exposed via save()
+   @Test
+   public void testSaveChanges() throws SQLException {
+       var original = nodeConfigurationService.create(1, "Test", false, false, NodeProfile.MIXED);
+
+       assertEquals(1, original.node());
+       assertEquals("Test", original.description());
+       assertFalse(original.acceptQueries());
+
+       var precession = new NodeConfiguration(
+               original.node(),
+               "Foo",
+               true,
+               original.autoClean(),
+               original.includeInPrecession(),
+               !original.autoAssignDomains(),
+               original.keepWarcs(),
+               original.profile(),
+               original.disabled()
+       );
+
+       nodeConfigurationService.save(precession);
+       precession = nodeConfigurationService.get(original.node());
+       assertNotEquals(original.autoAssignDomains(), precession.autoAssignDomains());
+
+       var autoClean = new NodeConfiguration(
+               original.node(),
+               "Foo",
+               true,
+               !original.autoClean(),
+               original.includeInPrecession(),
+               original.autoAssignDomains(),
+               original.keepWarcs(),
+               original.profile(),
+               original.disabled()
+       );
+
+       nodeConfigurationService.save(autoClean);
+       autoClean = nodeConfigurationService.get(original.node());
+       assertNotEquals(original.autoClean(), autoClean.autoClean());
+
+       var disabled = new NodeConfiguration(
+               original.node(),
+               "Foo",
+               true,
+               autoClean.autoClean(),
+               autoClean.includeInPrecession(),
+               autoClean.autoAssignDomains(),
+               autoClean.keepWarcs(),
+               autoClean.profile(),
+               !autoClean.disabled()
+       );
+       nodeConfigurationService.save(disabled);
+       disabled = nodeConfigurationService.get(original.node());
+       assertNotEquals(autoClean.disabled(), disabled.disabled());
+   }
}

@@ -0,0 +1,5 @@
CREATE TABLE IF NOT EXISTS WMSA_prod.NSFW_DOMAINS (
    ID INT NOT NULL AUTO_INCREMENT,
    TIER INT NOT NULL,
    PRIMARY KEY (ID)
);

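A minimal sketch (not part of the diff) of how downstream code could read this table, for instance to load the blocked domain ids at or below a given tier. The method name and the tier comparison are assumptions for illustration; only the table and columns come from the migration above.

    // Hypothetical reader for NSFW_DOMAINS; assumes a HikariDataSource wired up as elsewhere in the codebase.
    List<Integer> loadBlockedDomainIds(HikariDataSource dataSource, int maxTier) throws SQLException {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("""
                     SELECT ID
                     FROM NSFW_DOMAINS
                     WHERE TIER <= ?
                     """)) {
            stmt.setInt(1, maxTier);
            var rs = stmt.executeQuery();
            List<Integer> ids = new ArrayList<>();
            while (rs.next()) {
                ids.add(rs.getInt("ID"));
            }
            return ids;
        }
    }
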
@@ -0,0 +1,213 @@

-- Create metadata tables for domain ping status and security information

-- These are not ICMP pings, but rather HTTP(S) pings to check the availability and security
-- of web servers associated with domains, to assess uptime and changes in security configurations
-- indicating ownership changes or security issues.

-- Note: DOMAIN_ID and NODE_ID are used to identify the domain and the node that performed the ping.
-- These are strictly speaking foreign keys to the EC_DOMAIN table, but as it
-- is strictly append-only, we do not need to enforce foreign key constraints.

CREATE TABLE IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION (
    DOMAIN_ID INT NOT NULL PRIMARY KEY,
    NODE_ID INT NOT NULL,

    SERVER_AVAILABLE BOOLEAN NOT NULL,          -- Indicates if the server is available (true) or not (false)
    SERVER_IP VARBINARY(16),                    -- IP address of the server (IPv4 or IPv6)
    SERVER_IP_ASN INTEGER,                      -- Autonomous System number

    DATA_HASH BIGINT,                           -- Hash of the data for integrity checks
    SECURITY_CONFIG_HASH BIGINT,                -- Hash of the security configuration for integrity checks

    HTTP_SCHEMA ENUM('HTTP', 'HTTPS'),          -- HTTP or HTTPS protocol used
    HTTP_ETAG VARCHAR(255),                     -- ETag of the resource as per HTTP headers
    HTTP_LAST_MODIFIED VARCHAR(255),            -- Last modified date of the resource as per HTTP headers
    HTTP_STATUS INT,                            -- HTTP status code (e.g., 200, 404, etc.)
    HTTP_LOCATION VARCHAR(255),                 -- If the server redirects, this is the location of the redirect
    HTTP_RESPONSE_TIME_MS SMALLINT UNSIGNED,    -- Response time in milliseconds

    ERROR_CLASSIFICATION ENUM('NONE', 'TIMEOUT', 'SSL_ERROR', 'DNS_ERROR', 'CONNECTION_ERROR', 'HTTP_CLIENT_ERROR', 'HTTP_SERVER_ERROR', 'UNKNOWN'), -- Classification of the error if the server is not available
    ERROR_MESSAGE VARCHAR(255),                 -- Error message if the server is not available

    TS_LAST_PING TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, -- Timestamp of the last ping
    TS_LAST_AVAILABLE TIMESTAMP,                -- Timestamp of the last time the server was available
    TS_LAST_ERROR TIMESTAMP,                    -- Timestamp of the last error encountered

    NEXT_SCHEDULED_UPDATE TIMESTAMP NOT NULL,
    BACKOFF_CONSECUTIVE_FAILURES INT NOT NULL DEFAULT 0, -- Number of consecutive failures to ping the server
    BACKOFF_FETCH_INTERVAL INT NOT NULL DEFAULT 60       -- Interval in seconds for the next scheduled ping
) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;

CREATE INDEX IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_AVAILABILITY_INFORMATION (NODE_ID, DOMAIN_ID);
CREATE INDEX IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION__NEXT_SCHEDULED_UPDATE_IDX ON DOMAIN_AVAILABILITY_INFORMATION (NODE_ID, NEXT_SCHEDULED_UPDATE);


CREATE TABLE IF NOT EXISTS DOMAIN_SECURITY_INFORMATION (
    DOMAIN_ID INT NOT NULL PRIMARY KEY,
    NODE_ID INT NOT NULL,

    ASN INTEGER,                                -- Autonomous System Number (ASN) of the server
    HTTP_SCHEMA ENUM('HTTP', 'HTTPS'),          -- HTTP or HTTPS protocol used
    HTTP_VERSION VARCHAR(10),                   -- HTTP version used (e.g., HTTP/1.1, HTTP/2)
    HTTP_COMPRESSION VARCHAR(50),               -- Compression method used (e.g., gzip, deflate, br)
    HTTP_CACHE_CONTROL TEXT,                    -- Cache control directives from HTTP headers

    SSL_CERT_NOT_BEFORE TIMESTAMP,              -- Valid from date (usually same as issued)
    SSL_CERT_NOT_AFTER TIMESTAMP,               -- Valid until date (usually same as expires)

    SSL_CERT_ISSUER VARCHAR(255),               -- CA that issued the cert
    SSL_CERT_SUBJECT VARCHAR(255),              -- Certificate subject/CN

    SSL_CERT_PUBLIC_KEY_HASH BINARY(32),        -- SHA-256 hash of the public key
    SSL_CERT_SERIAL_NUMBER VARCHAR(100),        -- Unique cert serial number
    SSL_CERT_FINGERPRINT_SHA256 BINARY(32),     -- SHA-256 fingerprint for exact identification
    SSL_CERT_SAN TEXT,                          -- Subject Alternative Names (JSON array)
    SSL_CERT_WILDCARD BOOLEAN,                  -- Wildcard certificate (*.example.com)

    SSL_PROTOCOL VARCHAR(20),                   -- TLS 1.2, TLS 1.3, etc.
    SSL_CIPHER_SUITE VARCHAR(100),              -- e.g., TLS_AES_256_GCM_SHA384
    SSL_KEY_EXCHANGE VARCHAR(50),               -- ECDHE, RSA, etc.
    SSL_CERTIFICATE_CHAIN_LENGTH TINYINT,       -- Number of certs in chain

    SSL_CERTIFICATE_VALID BOOLEAN,              -- Valid cert chain

    HEADER_CORS_ALLOW_ORIGIN TEXT,              -- Could be *, specific domains, or null
    HEADER_CORS_ALLOW_CREDENTIALS BOOLEAN,      -- Credential handling
    HEADER_CONTENT_SECURITY_POLICY_HASH INT,    -- CSP header, hash of the policy
    HEADER_STRICT_TRANSPORT_SECURITY VARCHAR(255), -- HSTS header
    HEADER_REFERRER_POLICY VARCHAR(50),         -- Referrer handling
    HEADER_X_FRAME_OPTIONS VARCHAR(50),         -- Clickjacking protection
    HEADER_X_CONTENT_TYPE_OPTIONS VARCHAR(50),  -- MIME sniffing protection
    HEADER_X_XSS_PROTECTION VARCHAR(50),        -- XSS protection header

    HEADER_SERVER VARCHAR(255),                 -- Server header (e.g., Apache, Nginx, etc.)
    HEADER_X_POWERED_BY VARCHAR(255),           -- X-Powered-By header (if present)

    TS_LAST_UPDATE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP -- Timestamp of the last SSL check
) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;


CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_INFORMATION__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_SECURITY_INFORMATION (NODE_ID, DOMAIN_ID);

CREATE TABLE IF NOT EXISTS DOMAIN_SECURITY_EVENTS (
    CHANGE_ID BIGINT AUTO_INCREMENT PRIMARY KEY,    -- Unique identifier for the change
    DOMAIN_ID INT NOT NULL,                         -- Domain ID, used as a foreign key to EC_DOMAIN
    NODE_ID INT NOT NULL,

    TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, -- Timestamp of the change

    CHANGE_ASN BOOLEAN NOT NULL DEFAULT FALSE,                      -- Indicates if the change is related to ASN (Autonomous System Number)
    CHANGE_CERTIFICATE_FINGERPRINT BOOLEAN NOT NULL DEFAULT FALSE,  -- Indicates if the change is related to SSL certificate fingerprint
    CHANGE_CERTIFICATE_PROFILE BOOLEAN NOT NULL DEFAULT FALSE,      -- Indicates if the change is related to SSL certificate profile (e.g., algorithm, exchange)
    CHANGE_CERTIFICATE_SAN BOOLEAN NOT NULL DEFAULT FALSE,          -- Indicates if the change is related to SSL certificate SAN (Subject Alternative Name)
    CHANGE_CERTIFICATE_PUBLIC_KEY BOOLEAN NOT NULL DEFAULT FALSE,   -- Indicates if the change is related to SSL certificate public key
    CHANGE_SECURITY_HEADERS BOOLEAN NOT NULL DEFAULT FALSE,         -- Indicates if the change is related to security headers
    CHANGE_IP_ADDRESS BOOLEAN NOT NULL DEFAULT FALSE,               -- Indicates if the change is related to IP address
    CHANGE_SOFTWARE BOOLEAN NOT NULL DEFAULT FALSE,                 -- Indicates if the change is related to the generator (e.g., web server software)
    OLD_CERT_TIME_TO_EXPIRY INT,                                    -- Time to expiry of the old certificate in hours, if applicable

    SECURITY_SIGNATURE_BEFORE BLOB NOT NULL,        -- Security signature before the change, gzipped json record
    SECURITY_SIGNATURE_AFTER BLOB NOT NULL          -- Security signature after the change, gzipped json record
) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;

CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_EVENTS__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_SECURITY_EVENTS (NODE_ID, DOMAIN_ID);
CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_EVENTS__TS_CHANGE_IDX ON DOMAIN_SECURITY_EVENTS (TS_CHANGE);

CREATE TABLE IF NOT EXISTS DOMAIN_AVAILABILITY_EVENTS (
    DOMAIN_ID INT NOT NULL,
    NODE_ID INT NOT NULL,

    AVAILABLE BOOLEAN NOT NULL,             -- True if the service is available, false if it is not
    OUTAGE_TYPE ENUM('NONE', 'TIMEOUT', 'SSL_ERROR', 'DNS_ERROR', 'CONNECTION_ERROR', 'HTTP_CLIENT_ERROR', 'HTTP_SERVER_ERROR', 'UNKNOWN') NOT NULL,
    HTTP_STATUS_CODE INT,                   -- HTTP status code if available (e.g., 200, 404, etc.)
    ERROR_MESSAGE VARCHAR(255),             -- Specific error details

    TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, -- Timestamp of the last update

    AVAILABILITY_RECORD_ID BIGINT AUTO_INCREMENT,
    P_KEY_MONTH TINYINT NOT NULL DEFAULT MONTH(TS_CHANGE),  -- Month of the change for partitioning
    PRIMARY KEY (AVAILABILITY_RECORD_ID, P_KEY_MONTH)
)
CHARACTER SET utf8mb4 COLLATE utf8mb4_bin
PARTITION BY RANGE (P_KEY_MONTH) (
    PARTITION p0 VALUES LESS THAN (1),   -- January
    PARTITION p1 VALUES LESS THAN (2),   -- February
    PARTITION p2 VALUES LESS THAN (3),   -- March
    PARTITION p3 VALUES LESS THAN (4),   -- April
    PARTITION p4 VALUES LESS THAN (5),   -- May
    PARTITION p5 VALUES LESS THAN (6),   -- June
    PARTITION p6 VALUES LESS THAN (7),   -- July
    PARTITION p7 VALUES LESS THAN (8),   -- August
    PARTITION p8 VALUES LESS THAN (9),   -- September
    PARTITION p9 VALUES LESS THAN (10),  -- October
    PARTITION p10 VALUES LESS THAN (11), -- November
    PARTITION p11 VALUES LESS THAN (12)  -- December
);

CREATE INDEX DOMAIN_AVAILABILITY_EVENTS__DOMAIN_ID_TS_IDX ON DOMAIN_AVAILABILITY_EVENTS (DOMAIN_ID, TS_CHANGE);
CREATE INDEX DOMAIN_AVAILABILITY_EVENTS__TS_CHANGE_IDX ON DOMAIN_AVAILABILITY_EVENTS (TS_CHANGE);

CREATE TABLE IF NOT EXISTS DOMAIN_DNS_INFORMATION (
    DNS_ROOT_DOMAIN_ID INT AUTO_INCREMENT PRIMARY KEY,
    ROOT_DOMAIN_NAME VARCHAR(255) NOT NULL UNIQUE,
    NODE_AFFINITY INT NOT NULL,             -- Node ID that performs the DNS check, assign randomly across nodes

    DNS_A_RECORDS TEXT,                     -- JSON array of IPv4 addresses
    DNS_AAAA_RECORDS TEXT,                  -- JSON array of IPv6 addresses
    DNS_CNAME_RECORD VARCHAR(255),          -- Canonical name (if applicable)
    DNS_MX_RECORDS TEXT,                    -- JSON array of mail exchange records
    DNS_CAA_RECORDS TEXT,                   -- Certificate Authority Authorization
    DNS_TXT_RECORDS TEXT,                   -- TXT records (SPF, DKIM, verification, etc.)
    DNS_NS_RECORDS TEXT,                    -- Name servers (JSON array)
    DNS_SOA_RECORD TEXT,                    -- Start of Authority (JSON object)

    TS_LAST_DNS_CHECK TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    TS_NEXT_DNS_CHECK TIMESTAMP NOT NULL,
    DNS_CHECK_PRIORITY TINYINT DEFAULT 0    -- Priority of the DNS check, in case we want to schedule a refresh sooner
) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;

CREATE INDEX DOMAIN_DNS_INFORMATION__PRIORITY_NEXT_CHECK_IDX ON DOMAIN_DNS_INFORMATION (NODE_AFFINITY, DNS_CHECK_PRIORITY DESC, TS_NEXT_DNS_CHECK);

CREATE TABLE IF NOT EXISTS DOMAIN_DNS_EVENTS (
    DNS_ROOT_DOMAIN_ID INT NOT NULL,
    NODE_ID INT NOT NULL,

    TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,

    -- DNS change type flags
    CHANGE_A_RECORDS BOOLEAN NOT NULL DEFAULT FALSE,      -- IPv4 address changes
    CHANGE_AAAA_RECORDS BOOLEAN NOT NULL DEFAULT FALSE,   -- IPv6 address changes
    CHANGE_CNAME BOOLEAN NOT NULL DEFAULT FALSE,          -- CNAME changes
    CHANGE_MX_RECORDS BOOLEAN NOT NULL DEFAULT FALSE,     -- Mail server changes
    CHANGE_CAA_RECORDS BOOLEAN NOT NULL DEFAULT FALSE,    -- Certificate authority changes
    CHANGE_TXT_RECORDS BOOLEAN NOT NULL DEFAULT FALSE,    -- TXT record changes (SPF, DKIM, etc.)
    CHANGE_NS_RECORDS BOOLEAN NOT NULL DEFAULT FALSE,     -- Name server changes (big red flag!)
    CHANGE_SOA_RECORD BOOLEAN NOT NULL DEFAULT FALSE,     -- Start of Authority changes

    DNS_SIGNATURE_BEFORE BLOB NOT NULL,     -- Compressed JSON snapshot of DNS records before change
    DNS_SIGNATURE_AFTER BLOB NOT NULL,      -- Compressed JSON snapshot of DNS records after change

    DNS_EVENT_ID BIGINT AUTO_INCREMENT,
    P_KEY_MONTH TINYINT NOT NULL DEFAULT MONTH(TS_CHANGE), -- Month of the change for partitioning
    PRIMARY KEY (DNS_EVENT_ID, P_KEY_MONTH)
)
CHARACTER SET utf8mb4 COLLATE utf8mb4_bin
PARTITION BY RANGE (P_KEY_MONTH) (
    PARTITION p0 VALUES LESS THAN (1),   -- January
    PARTITION p1 VALUES LESS THAN (2),   -- February
    PARTITION p2 VALUES LESS THAN (3),   -- March
    PARTITION p3 VALUES LESS THAN (4),   -- April
    PARTITION p4 VALUES LESS THAN (5),   -- May
    PARTITION p5 VALUES LESS THAN (6),   -- June
    PARTITION p6 VALUES LESS THAN (7),   -- July
    PARTITION p7 VALUES LESS THAN (8),   -- August
    PARTITION p8 VALUES LESS THAN (9),   -- September
    PARTITION p9 VALUES LESS THAN (10),  -- October
    PARTITION p10 VALUES LESS THAN (11), -- November
    PARTITION p11 VALUES LESS THAN (12)  -- December
);

CREATE INDEX DOMAIN_DNS_EVENTS__DNS_ROOT_DOMAIN_ID_TS_IDX ON DOMAIN_DNS_EVENTS (DNS_ROOT_DOMAIN_ID, TS_CHANGE);
CREATE INDEX DOMAIN_DNS_EVENTS__TS_CHANGE_IDX ON DOMAIN_DNS_EVENTS (TS_CHANGE);

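The availability table above carries its own scheduling state (NEXT_SCHEDULED_UPDATE plus the backoff columns), and the second index is keyed on (NODE_ID, NEXT_SCHEDULED_UPDATE). A minimal sketch, not part of the diff, of how a ping process might use that index to pull the next batch of overdue domains; the class and method names are made up, only the table and index come from the migration.

    // Hypothetical helper, assuming a HikariDataSource wired up the same way as in the services above.
    class AvailabilityCheckQueue {
        private final HikariDataSource dataSource;

        AvailabilityCheckQueue(HikariDataSource dataSource) {
            this.dataSource = dataSource;
        }

        /** Fetch up to 'limit' domain ids on this node whose next scheduled check is overdue. */
        List<Integer> nextBatch(int nodeId, int limit) throws SQLException {
            try (var conn = dataSource.getConnection();
                 var stmt = conn.prepareStatement("""
                         SELECT DOMAIN_ID
                         FROM DOMAIN_AVAILABILITY_INFORMATION
                         WHERE NODE_ID = ? AND NEXT_SCHEDULED_UPDATE <= NOW()
                         ORDER BY NEXT_SCHEDULED_UPDATE
                         LIMIT ?
                         """)) {
                stmt.setInt(1, nodeId);
                stmt.setInt(2, limit);
                var rs = stmt.executeQuery();
                List<Integer> ids = new ArrayList<>();
                while (rs.next()) {
                    ids.add(rs.getInt("DOMAIN_ID"));
                }
                return ids;
            }
        }
    }
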
@@ -0,0 +1,6 @@
-- Add additional summary columns to DOMAIN_SECURITY_EVENTS table
-- to make it easier to make sense of certificate changes

ALTER TABLE DOMAIN_SECURITY_EVENTS ADD COLUMN CHANGE_CERTIFICATE_SERIAL_NUMBER BOOLEAN NOT NULL DEFAULT FALSE;
ALTER TABLE DOMAIN_SECURITY_EVENTS ADD COLUMN CHANGE_CERTIFICATE_ISSUER BOOLEAN NOT NULL DEFAULT FALSE;
OPTIMIZE TABLE DOMAIN_SECURITY_EVENTS;

@@ -0,0 +1,7 @@
-- Add additional summary columns to DOMAIN_SECURITY_INFORMATION table
-- to make it easier to get more information about the SSL certificate's validity

ALTER TABLE DOMAIN_SECURITY_INFORMATION ADD COLUMN SSL_CHAIN_VALID BOOLEAN DEFAULT NULL;
ALTER TABLE DOMAIN_SECURITY_INFORMATION ADD COLUMN SSL_HOST_VALID BOOLEAN DEFAULT NULL;
ALTER TABLE DOMAIN_SECURITY_INFORMATION ADD COLUMN SSL_DATE_VALID BOOLEAN DEFAULT NULL;
OPTIMIZE TABLE DOMAIN_SECURITY_INFORMATION;

@@ -0,0 +1,5 @@
-- Add additional summary columns to DOMAIN_SECURITY_EVENTS table
-- to make it easier to make sense of certificate changes

ALTER TABLE DOMAIN_SECURITY_EVENTS ADD COLUMN CHANGE_SCHEMA ENUM('NONE', 'HTTP_TO_HTTPS', 'HTTPS_TO_HTTP', 'UNKNOWN') NOT NULL DEFAULT 'UNKNOWN';
OPTIMIZE TABLE DOMAIN_SECURITY_EVENTS;

@@ -0,0 +1,12 @@
-- Table holding domains to be processed by the NDP in order to figure out whether to add them to
-- be crawled.

CREATE TABLE IF NOT EXISTS NDP_NEW_DOMAINS(
    DOMAIN_ID INT NOT NULL PRIMARY KEY,
    STATE ENUM ('NEW', 'ACCEPTED', 'REJECTED') NOT NULL DEFAULT 'NEW',
    PRIORITY INT NOT NULL DEFAULT 0,
    TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    CHECK_COUNT INT NOT NULL DEFAULT 0
);

CREATE INDEX IF NOT EXISTS NDP_NEW_DOMAINS__STATE_PRIORITY ON NDP_NEW_DOMAINS (STATE, PRIORITY DESC);

@@ -0,0 +1,3 @@
-- Migration script to add AUTO_ASSIGN_DOMAINS column to NODE_CONFIGURATION table

ALTER TABLE NODE_CONFIGURATION ADD COLUMN AUTO_ASSIGN_DOMAINS BOOLEAN NOT NULL DEFAULT TRUE;

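NDP_NEW_DOMAINS is effectively a priority queue for the new-domain process, and the (STATE, PRIORITY DESC) index matches that access pattern. A sketch, not part of the diff, of how the NDP might claim a batch of candidates; the method and batch size are assumptions, only the table, the STATE values and the index come from the migration.

    // Hypothetical work-claiming query for the new-domain process (NDP).
    List<Integer> claimNewDomains(HikariDataSource dataSource, int batchSize) throws SQLException {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("""
                     SELECT DOMAIN_ID
                     FROM NDP_NEW_DOMAINS
                     WHERE STATE = 'NEW'
                     ORDER BY PRIORITY DESC
                     LIMIT ?
                     """)) {
            stmt.setInt(1, batchSize);
            var rs = stmt.executeQuery();
            List<Integer> ids = new ArrayList<>();
            while (rs.next()) {
                ids.add(rs.getInt("DOMAIN_ID"));
            }
            // After evaluating each domain, a follow-up UPDATE would flip STATE to
            // ACCEPTED or REJECTED and bump CHECK_COUNT.
            return ids;
        }
    }
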
@@ -112,14 +112,6 @@ public class EdgeDomain implements Serializable {
        return topDomain;
    }

-   public String getDomainKey() {
-       int cutPoint = topDomain.indexOf('.');
-       if (cutPoint < 0) {
-           return topDomain;
-       }
-       return topDomain.substring(0, cutPoint).toLowerCase();
-   }
-
    /** If possible, try to provide an alias domain,
     * i.e. a domain name that is very likely to link to this one
     * */

@@ -6,11 +6,20 @@ import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;

import java.net.URISyntaxException;
+import java.time.Instant;

public class GsonFactory {
    public static Gson get() {
        return new GsonBuilder()
                .registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
+               .registerTypeAdapter(Instant.class, (JsonSerializer<Instant>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toEpochMilli()))
+               .registerTypeAdapter(Instant.class, (JsonDeserializer<Instant>) (json, typeOfT, context) -> {
+                   if (json.isJsonPrimitive() && json.getAsJsonPrimitive().isNumber()) {
+                       return Instant.ofEpochMilli(json.getAsLong());
+                   } else {
+                       throw new JsonParseException("Expected a number for Instant");
+                   }
+               })
                .registerTypeAdapter(EdgeUrl.class, (JsonSerializer<EdgeUrl>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
                .registerTypeAdapter(EdgeDomain.class, (JsonSerializer<EdgeDomain>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
                .registerTypeAdapter(EdgeUrl.class, (JsonDeserializer<EdgeUrl>) (json, typeOfT, context) -> {

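A small round-trip sketch, not part of the diff, showing what the new Instant adapters do in practice: Instants are written as epoch-millisecond numbers and read back from the same representation. It reuses only GsonFactory, Gson and Instant as they appear in the file above.

    Gson gson = GsonFactory.get();

    String json = gson.toJson(Instant.parse("2025-05-01T12:00:00Z")); // json == "1746100800000"
    Instant restored = gson.fromJson(json, Instant.class);            // same instant, read back from epoch millis
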
@@ -8,14 +8,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;

class EdgeDomainTest {

-   @Test
-   public void testSkepdic() throws URISyntaxException {
-       var domain = new EdgeUrl("http://www.skepdic.com/astrology.html");
-       assertEquals("skepdic", domain.getDomain().getDomainKey());
-       var domain2 = new EdgeUrl("http://skepdic.com/astrology.html");
-       assertEquals("skepdic", domain2.getDomain().getDomainKey());
-   }
-
    @Test
    public void testHkDomain() throws URISyntaxException {
        var domain = new EdgeUrl("http://l7072i3.l7c.net");

@@ -0,0 +1,59 @@
package nu.marginalia.process.control;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.process.ProcessConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.sql.SQLException;
import java.util.Objects;
import java.util.UUID;

@Singleton
public class ProcessEventLog {
    private final HikariDataSource dataSource;

    private final Logger logger = LoggerFactory.getLogger(ProcessEventLog.class);

    private final String serviceName;
    private final UUID instanceUuid;
    private final String serviceBase;

    @Inject
    public ProcessEventLog(HikariDataSource dataSource, ProcessConfiguration configuration) {
        this.dataSource = dataSource;

        this.serviceName = configuration.processName() + ":" + configuration.node();
        this.instanceUuid = configuration.instanceUuid();
        this.serviceBase = configuration.processName();

        logger.info("Starting service {} instance {}", serviceName, instanceUuid);

        logEvent("PCS-START", serviceName);
    }

    public void logEvent(Class<?> type, String message) {
        logEvent(type.getSimpleName(), message);
    }
    public void logEvent(String type, String message) {

        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("""
                     INSERT INTO SERVICE_EVENTLOG(SERVICE_NAME, SERVICE_BASE, INSTANCE, EVENT_TYPE, EVENT_MESSAGE)
                     VALUES (?, ?, ?, ?, ?)
                     """)) {
            stmt.setString(1, serviceName);
            stmt.setString(2, serviceBase);
            stmt.setString(3, instanceUuid.toString());
            stmt.setString(4, type);
            stmt.setString(5, Objects.requireNonNull(message, ""));

            stmt.executeUpdate();
        }
        catch (SQLException ex) {
            logger.error("Failed to log event {}:{}", type, message);
        }
    }
}

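A usage sketch, not part of the diff: ProcessEventLog is a Guice singleton meant to be injected into a process, after which events are one call each. The event type strings and the SomeTask class below are made up for illustration.

    // Hypothetical usage; in practice Guice injects ProcessEventLog into a process main.
    ProcessEventLog eventLog = injector.getInstance(ProcessEventLog.class);

    eventLog.logEvent("CRAWLER-INFO", "Finished crawling batch 7");
    eventLog.logEvent(SomeTask.class, "Task completed");  // uses the class's simple name as the event type
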
@@ -1,17 +1,21 @@
package nu.marginalia.service.discovery;

-import nu.marginalia.service.discovery.monitor.*;
import com.google.inject.ImplementedBy;
+import nu.marginalia.service.discovery.monitor.ServiceChangeMonitor;
+import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
import nu.marginalia.service.discovery.property.ServiceEndpoint;
-import static nu.marginalia.service.discovery.property.ServiceEndpoint.*;

import nu.marginalia.service.discovery.property.ServiceKey;
+import org.apache.curator.framework.recipes.locks.InterProcessSemaphoreV2;

import java.util.List;
import java.util.UUID;

+import static nu.marginalia.service.discovery.property.ServiceEndpoint.InstanceAddress;

/** A service registry that allows services to register themselves and
 * be discovered by other services on the network.
 */
@ImplementedBy(ZkServiceRegistry.class)
public interface ServiceRegistryIf {
    /**
     * Register a service with the registry.

@@ -57,4 +61,9 @@ public interface ServiceRegistryIf {
     * </ul>
     * */
    void registerMonitor(ServiceMonitorIf monitor) throws Exception;

+   void registerProcess(String processName, int nodeId);
+   void deregisterProcess(String processName, int nodeId);
+
+   InterProcessSemaphoreV2 getSemaphore(String name, int permits) throws Exception;
}

@@ -6,6 +6,7 @@ import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
import nu.marginalia.service.discovery.property.ServiceEndpoint;
import nu.marginalia.service.discovery.property.ServiceKey;
import org.apache.curator.framework.CuratorFramework;
+import org.apache.curator.framework.recipes.locks.InterProcessSemaphoreV2;
import org.apache.curator.utils.ZKPaths;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.Watcher;

@@ -256,6 +257,42 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
                .forPath("/running-instances");
    }

+   @Override
+   public void registerProcess(String processName, int nodeId) {
+       String path = "/process-locks/" + processName + "/" + nodeId;
+       try {
+           curatorFramework.create()
+                   .creatingParentsIfNeeded()
+                   .withMode(CreateMode.EPHEMERAL)
+                   .forPath(path);
+           livenessPaths.add(path);
+       }
+       catch (Exception ex) {
+           logger.error("Failed to register process {} on node {}", processName, nodeId, ex);
+       }
+   }
+
+   @Override
+   public void deregisterProcess(String processName, int nodeId) {
+       String path = "/process-locks/" + processName + "/" + nodeId;
+       try {
+           curatorFramework.delete().forPath(path);
+           livenessPaths.remove(path);
+       }
+       catch (Exception ex) {
+           logger.error("Failed to deregister process {} on node {}", processName, nodeId, ex);
+       }
+   }
+
+   @Override
+   public InterProcessSemaphoreV2 getSemaphore(String name, int permits) {
+       if (stopped)
+           throw new IllegalStateException("Service registry is stopped, cannot get semaphore " + name);
+
+       String path = "/semaphores/" + name;
+       return new InterProcessSemaphoreV2(curatorFramework, path, permits);
+   }
+
    /* Exposed for tests */
    public synchronized void shutDown() {
        if (stopped)

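A sketch, not part of the diff, of how a process might combine the new registry calls: an ephemeral liveness node via registerProcess plus a cluster-wide semaphore to bound concurrency. The process name, semaphore name and permit count here are made up; the Curator Lease/acquire/returnLease calls are the standard InterProcessSemaphoreV2 API.

    void runWithClusterSlot(ServiceRegistryIf registry, int nodeId) throws Exception {
        registry.registerProcess("crawler", nodeId);
        try {
            // At most 4 holders cluster-wide; name and permit count are illustrative.
            InterProcessSemaphoreV2 semaphore = registry.getSemaphore("crawler-slots", 4);
            Lease lease = semaphore.acquire();   // blocks until a permit is available
            try {
                // bounded work goes here
            } finally {
                semaphore.returnLease(lease);
            }
        } finally {
            registry.deregisterProcess("crawler", nodeId);
        }
    }
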
@@ -9,6 +9,7 @@ import nu.marginalia.executor.storage.FileStorageFile;
import nu.marginalia.executor.upload.UploadDirContents;
import nu.marginalia.executor.upload.UploadDirItem;
import nu.marginalia.functions.execution.api.*;
+import nu.marginalia.mq.persistence.MqPersistence;
import nu.marginalia.service.ServiceId;
import nu.marginalia.service.client.GrpcChannelPoolFactory;
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;

@@ -25,27 +26,37 @@ import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
+import java.time.Duration;
import java.util.List;

import static nu.marginalia.functions.execution.api.ExecutorApiGrpc.ExecutorApiBlockingStub;

@Singleton
public class ExecutorClient {
+   private final MqPersistence persistence;
    private final GrpcMultiNodeChannelPool<ExecutorApiBlockingStub> channelPool;
    private static final Logger logger = LoggerFactory.getLogger(ExecutorClient.class);
    private final ServiceRegistryIf registry;

    @Inject
    public ExecutorClient(ServiceRegistryIf registry,
+                         MqPersistence persistence,
                          GrpcChannelPoolFactory grpcChannelPoolFactory)
    {
        this.registry = registry;
+       this.persistence = persistence;
        this.channelPool = grpcChannelPoolFactory
                .createMulti(
                        ServiceKey.forGrpcApi(ExecutorApiGrpc.class, ServicePartition.multi()),
                        ExecutorApiGrpc::newBlockingStub);
    }

+   private long createTrackingTokenMsg(String task, int node, Duration ttl) throws Exception {
+       return persistence.sendNewMessage("task-tracking[" + node + "]", "export-client", null, task, "", ttl);
+   }


    public void startFsm(int node, String actorName) {
        channelPool.call(ExecutorApiBlockingStub::startFsm)
                .forNode(node)

@@ -96,6 +107,16 @@ public class ExecutorClient {
                .build());
    }

+   public long updateNsfwFilters() throws Exception {
+       long msgId = createTrackingTokenMsg("nsfw-filters", 1, Duration.ofHours(6));
+
+       channelPool.call(ExecutorApiBlockingStub::updateNsfwFilters)
+               .forNode(1)
+               .run(RpcUpdateNsfwFilters.newBuilder().setMsgId(msgId).build());
+
+       return msgId;
+   }

    public ActorRunStates getActorStates(int node) {
        try {
            var rs = channelPool.call(ExecutorApiBlockingStub::getActorStates)

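A caller-side sketch, not part of the diff, of how a control surface might use the new client method; the wrapper method name is made up, and only ExecutorClient.updateNsfwFilters() itself comes from the change above.

    long requestNsfwFilterUpdate(ExecutorClient executorClient) throws Exception {
        long msgId = executorClient.updateNsfwFilters();
        // The actor on node 1 marks this tracking message OK or ERR when it finishes
        // (see UpdateNsfwFiltersActor further down in this diff).
        return msgId;
    }
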
@@ -18,6 +18,8 @@ service ExecutorApi {
  rpc calculateAdjacencies(Empty) returns (Empty) {}
  rpc restoreBackup(RpcFileStorageId) returns (Empty) {}

+ rpc updateNsfwFilters(RpcUpdateNsfwFilters) returns (Empty) {}
+
  rpc restartExecutorService(Empty) returns (Empty) {}
}

@@ -66,6 +68,9 @@ message RpcExportRequest {
  int64 fileStorageId = 1;
  int64 msgId = 2;
}
+message RpcUpdateNsfwFilters {
+  int64 msgId = 1;
+}
message RpcFileStorageIdWithDomainName {
  int64 fileStorageId = 1;
  string targetDomainName = 2;

@@ -19,6 +19,8 @@ dependencies {
    implementation project(':code:processes:crawling-process')
    implementation project(':code:processes:live-crawling-process')
    implementation project(':code:processes:loading-process')
+   implementation project(':code:processes:ping-process')
+   implementation project(':code:processes:new-domain-process')
    implementation project(':code:processes:converting-process')
    implementation project(':code:processes:index-constructor-process')

@@ -37,9 +39,9 @@ dependencies {
    implementation project(':code:functions:link-graph:api')
    implementation project(':code:functions:live-capture:api')
    implementation project(':code:functions:search-query')
+   implementation project(':code:functions:nsfw-domain-filter')
    implementation project(':code:execution:api')

-   implementation project(':code:processes:crawling-process:model')
    implementation project(':code:processes:crawling-process:model')
    implementation project(':code:processes:crawling-process:ft-link-parser')
    implementation project(':code:index:index-journal')

@@ -6,12 +6,15 @@ import java.util.Set;

public enum ExecutorActor {
    PREC_EXPORT_ALL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+   UPDATE_NSFW_LISTS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD, NodeProfile.REALTIME),

    CRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
    RECRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
    RECRAWL_SINGLE_DOMAIN(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
    PROC_CRAWLER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+   PROC_PING_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.REALTIME),
    PROC_EXPORT_TASKS_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
+   PROC_NDP_SPAWNER(NodeProfile.MIXED, NodeProfile.REALTIME),
    ADJACENCY_CALCULATION(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
    EXPORT_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
    EXPORT_SEGMENTATION_MODEL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),

@@ -35,7 +38,8 @@ public enum ExecutorActor {
    LIVE_CRAWL(NodeProfile.REALTIME),
    PROC_LIVE_CRAWL_SPAWNER(NodeProfile.REALTIME),
    SCRAPE_FEEDS(NodeProfile.REALTIME),
-   UPDATE_RSS(NodeProfile.REALTIME);
+   UPDATE_RSS(NodeProfile.REALTIME)
+   ;

    public String id() {
        return "fsm:" + name().toLowerCase();

@@ -49,6 +49,8 @@ public class ExecutorActorControlService {
                                      RecrawlSingleDomainActor recrawlSingleDomainActor,
                                      RestoreBackupActor restoreBackupActor,
                                      ConverterMonitorActor converterMonitorFSM,
+                                     NdpMonitorActor ndpMonitorActor,
+                                     PingMonitorActor pingMonitorActor,
                                      CrawlerMonitorActor crawlerMonitorActor,
                                      LiveCrawlerMonitorActor liveCrawlerMonitorActor,
                                      LoaderMonitorActor loaderMonitor,

@@ -68,6 +70,7 @@ public class ExecutorActorControlService {
                                      ExecutorActorStateMachines stateMachines,
                                      MigrateCrawlDataActor migrateCrawlDataActor,
                                      ExportAllPrecessionActor exportAllPrecessionActor,
+                                     UpdateNsfwFiltersActor updateNsfwFiltersActor,
                                      UpdateRssActor updateRssActor) throws SQLException {
        this.messageQueueFactory = messageQueueFactory;
        this.eventLog = baseServiceParams.eventLog;

@@ -88,9 +91,10 @@ public class ExecutorActorControlService {
        register(ExecutorActor.PROC_CONVERTER_SPAWNER, converterMonitorFSM);
        register(ExecutorActor.PROC_LOADER_SPAWNER, loaderMonitor);
        register(ExecutorActor.PROC_CRAWLER_SPAWNER, crawlerMonitorActor);
        register(ExecutorActor.PROC_PING_SPAWNER, pingMonitorActor);
        register(ExecutorActor.PROC_LIVE_CRAWL_SPAWNER, liveCrawlerMonitorActor);
        register(ExecutorActor.PROC_EXPORT_TASKS_SPAWNER, exportTasksMonitorActor);

        register(ExecutorActor.PROC_NDP_SPAWNER, ndpMonitorActor);
        register(ExecutorActor.MONITOR_PROCESS_LIVENESS, processMonitorFSM);
        register(ExecutorActor.MONITOR_FILE_STORAGE, fileStorageMonitorActor);

@@ -109,6 +113,7 @@ public class ExecutorActorControlService {
        register(ExecutorActor.UPDATE_RSS, updateRssActor);

        register(ExecutorActor.MIGRATE_CRAWL_DATA, migrateCrawlDataActor);
+       register(ExecutorActor.UPDATE_NSFW_LISTS, updateNsfwFiltersActor);

        if (serviceConfiguration.node() == 1) {
            register(ExecutorActor.PREC_EXPORT_ALL, exportAllPrecessionActor);

@@ -0,0 +1,29 @@
package nu.marginalia.actor.proc;

import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
import nu.marginalia.mq.persistence.MqPersistence;
import nu.marginalia.mqapi.ProcessInboxNames;
import nu.marginalia.process.ProcessService;
import nu.marginalia.service.module.ServiceConfiguration;

@Singleton
public class NdpMonitorActor extends AbstractProcessSpawnerActor {

    @Inject
    public NdpMonitorActor(Gson gson,
                           ServiceConfiguration configuration,
                           MqPersistence persistence,
                           ProcessService processService) {
        super(gson,
              configuration,
              persistence,
              processService,
              ProcessInboxNames.NDP_INBOX,
              ProcessService.ProcessId.NDP);
    }

}

@@ -0,0 +1,181 @@
package nu.marginalia.actor.proc;

import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorResumeBehavior;
import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.actor.state.Resume;
import nu.marginalia.actor.state.Terminal;
import nu.marginalia.mq.MqMessageState;
import nu.marginalia.mq.persistence.MqMessageHandlerRegistry;
import nu.marginalia.mq.persistence.MqPersistence;
import nu.marginalia.mqapi.ProcessInboxNames;
import nu.marginalia.mqapi.ping.PingRequest;
import nu.marginalia.process.ProcessService;
import nu.marginalia.service.module.ServiceConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.sql.SQLException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;


// Unlike other monitor actors, the ping monitor will not merely wait for a request
// to be sent, but send one itself, hence we can't extend AbstractProcessSpawnerActor
// but have to reimplement a lot of the same logic ourselves.
@Singleton
public class PingMonitorActor extends RecordActorPrototype {

    private final MqPersistence persistence;
    private final ProcessService processService;

    private final Logger logger = LoggerFactory.getLogger(getClass());

    public static final int MAX_ATTEMPTS = 3;
    private final String inboxName;
    private final ProcessService.ProcessId processId;
    private final ExecutorService executorService = Executors.newSingleThreadExecutor();
    private final int node;
    private final Gson gson;

    public record Initial() implements ActorStep {}
    @Resume(behavior = ActorResumeBehavior.RETRY)
    public record Monitor(int errorAttempts) implements ActorStep {}
    @Resume(behavior = ActorResumeBehavior.RESTART)
    public record Run(int attempts) implements ActorStep {}
    @Terminal
    public record Aborted() implements ActorStep {}

    @Override
    public ActorStep transition(ActorStep self) throws Exception {
        return switch (self) {
            case Initial i -> {
                PingRequest request = new PingRequest();
                persistence.sendNewMessage(inboxName, null, null,
                        "PingRequest",
                        gson.toJson(request),
                        null);

                yield new Monitor(0);
            }
            case Monitor(int errorAttempts) -> {
                for (;;) {
                    var messages = persistence.eavesdrop(inboxName, 1);

                    if (messages.isEmpty() && !processService.isRunning(processId)) {
                        synchronized (processId) {
                            processId.wait(5000);
                        }

                        if (errorAttempts > 0) { // Reset the error counter if there is silence in the inbox
                            yield new Monitor(0);
                        }
                        // else continue
                    } else {
                        // Special: Associate this thread with the message so that we can get tracking
                        MqMessageHandlerRegistry.register(messages.getFirst().msgId());

                        yield new Run(0);
                    }
                }
            }
            case Run(int attempts) -> {
                try {
                    long startTime = System.currentTimeMillis();
                    var exec = new TaskExecution();
                    long endTime = System.currentTimeMillis();

                    if (exec.isError()) {
                        if (attempts < MAX_ATTEMPTS)
                            yield new Run(attempts + 1);
                        else
                            yield new Error();
                    }
                    else if (endTime - startTime < TimeUnit.SECONDS.toMillis(1)) {
                        // To avoid boot loops, we transition to error if the process
                        // didn't run for longer than 1 seconds. This might happen if
                        // the process crashes before it can reach the heartbeat and inbox
                        // stages of execution. In this case it would not report having acted
                        // on its message, and the process would be restarted forever without
                        // the attempts counter incrementing.
                        yield new Error("Process terminated within 1 seconds of starting");
                    }
                }
                catch (InterruptedException ex) {
                    // We get this exception when the process is cancelled by the user

                    processService.kill(processId);
                    setCurrentMessageToDead();

                    yield new Aborted();
                }

                yield new Monitor(attempts);
            }
            default -> new Error();
        };
    }

    public String describe() {
        return "Spawns a(n) " + processId + " process and monitors its inbox for messages";
    }

    @Inject
    public PingMonitorActor(Gson gson,
                            ServiceConfiguration configuration,
                            MqPersistence persistence,
                            ProcessService processService) throws SQLException {
        super(gson);
        this.gson = gson;
        this.node = configuration.node();
        this.persistence = persistence;
        this.processService = processService;
        this.inboxName = ProcessInboxNames.PING_INBOX + ":" + node;
        this.processId = ProcessService.ProcessId.PING;
    }

    /** Sets the message to dead in the database to avoid
     * the service respawning on the same task when we
     * re-enable this actor */
    private void setCurrentMessageToDead() {
        try {
            var messages = persistence.eavesdrop(inboxName, 1);

            if (messages.isEmpty()) // Possibly a race condition where the task is already finished
                return;

            var theMessage = messages.iterator().next();
            persistence.updateMessageState(theMessage.msgId(), MqMessageState.DEAD);
        }
        catch (SQLException ex) {
            logger.error("Tried but failed to set the message for " + processId + " to dead", ex);
        }
    }

    /** Encapsulates the execution of the process in a separate thread so that
     * we can interrupt the thread if the process is cancelled */
    private class TaskExecution {
        private final AtomicBoolean error = new AtomicBoolean(false);
        public TaskExecution() throws ExecutionException, InterruptedException {
            // Run this call in a separate thread so that this thread can be interrupted waiting for it
            executorService.submit(() -> {
                try {
                    processService.trigger(processId);
                } catch (Exception e) {
                    logger.warn("Error in triggering process", e);
                    error.set(true);
                }
            }).get(); // Wait for the process to start
        }

        public boolean isError() {
            return error.get();
        }
    }
}

@@ -44,7 +44,6 @@ public class LiveCrawlActor extends RecordActorPrototype {

    @Override
    public ActorStep transition(ActorStep self) throws Exception {
-       logger.info("{}", self);
        return switch (self) {
            case Initial() -> {
                yield new Monitor("-");

@@ -0,0 +1,60 @@
package nu.marginalia.actor.task;

import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.mq.MqMessageState;
import nu.marginalia.mq.persistence.MqPersistence;
import nu.marginalia.nsfw.NsfwDomainFilter;
import nu.marginalia.service.module.ServiceConfiguration;

@Singleton
public class UpdateNsfwFiltersActor extends RecordActorPrototype {
    private final ServiceConfiguration serviceConfiguration;
    private final NsfwDomainFilter nsfwDomainFilter;
    private final MqPersistence persistence;

    public record Initial(long respondMsgId) implements ActorStep {}
    public record Run(long respondMsgId) implements ActorStep {}

    @Override
    public ActorStep transition(ActorStep self) throws Exception {
        return switch(self) {
            case Initial(long respondMsgId) -> {
                if (serviceConfiguration.node() != 1) {
                    persistence.updateMessageState(respondMsgId, MqMessageState.ERR);
                    yield new Error("This actor can only run on node 1");
                }
                else {
                    yield new Run(respondMsgId);
                }
            }
            case Run(long respondMsgId) -> {
                nsfwDomainFilter.fetchLists();
                persistence.updateMessageState(respondMsgId, MqMessageState.OK);
                yield new End();
            }
            default -> new Error();
        };
    }

    @Override
    public String describe() {
        return "Sync NSFW filters";
    }

    @Inject
    public UpdateNsfwFiltersActor(Gson gson,
                                  ServiceConfiguration serviceConfiguration,
                                  NsfwDomainFilter nsfwDomainFilter,
                                  MqPersistence persistence)
    {
        super(gson);
        this.serviceConfiguration = serviceConfiguration;
        this.nsfwDomainFilter = nsfwDomainFilter;
        this.persistence = persistence;
    }

}

@@ -10,6 +10,7 @@ import nu.marginalia.actor.state.ActorStateInstance;
import nu.marginalia.actor.task.DownloadSampleActor;
import nu.marginalia.actor.task.RestoreBackupActor;
import nu.marginalia.actor.task.TriggerAdjacencyCalculationActor;
+import nu.marginalia.actor.task.UpdateNsfwFiltersActor;
import nu.marginalia.functions.execution.api.*;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.service.server.DiscoverableService;

@@ -263,4 +264,19 @@ public class ExecutorGrpcService
        System.exit(0);
    }

+   @Override
+   public void updateNsfwFilters(RpcUpdateNsfwFilters request, StreamObserver<Empty> responseObserver) {
+       logger.info("Got request {}", request);
+       try {
+           actorControlService.startFrom(ExecutorActor.UPDATE_NSFW_LISTS,
+                   new UpdateNsfwFiltersActor.Initial(request.getMsgId()));
+
+           responseObserver.onNext(Empty.getDefaultInstance());
+           responseObserver.onCompleted();
+       }
+       catch (Exception e) {
+           logger.error("Failed to update nsfw filters", e);
+           responseObserver.onError(e);
+       }
+   }
}

@@ -8,6 +8,8 @@ import nu.marginalia.crawl.CrawlerMain;
import nu.marginalia.index.IndexConstructorMain;
import nu.marginalia.livecrawler.LiveCrawlerMain;
import nu.marginalia.loading.LoaderMain;
+import nu.marginalia.ndp.NdpMain;
+import nu.marginalia.ping.PingMain;
import nu.marginalia.service.control.ServiceEventLog;
import nu.marginalia.service.server.BaseServiceParams;
import nu.marginalia.task.ExportTasksMain;

@@ -41,6 +43,7 @@ public class ProcessService {
        return switch (id) {
            case "converter" -> ProcessId.CONVERTER;
            case "crawler" -> ProcessId.CRAWLER;
+           case "ping" -> ProcessId.PING;
            case "loader" -> ProcessId.LOADER;
            case "export-tasks" -> ProcessId.EXPORT_TASKS;
            case "index-constructor" -> ProcessId.INDEX_CONSTRUCTOR;

@@ -50,10 +53,12 @@ public class ProcessService {

    public enum ProcessId {
        CRAWLER(CrawlerMain.class),
+       PING(PingMain.class),
        LIVE_CRAWLER(LiveCrawlerMain.class),
        CONVERTER(ConverterMain.class),
        LOADER(LoaderMain.class),
        INDEX_CONSTRUCTOR(IndexConstructorMain.class),
+       NDP(NdpMain.class),
        EXPORT_TASKS(ExportTasksMain.class),
        ;

@@ -68,6 +73,8 @@ public class ProcessService {
            case LIVE_CRAWLER -> "LIVE_CRAWLER_PROCESS_OPTS";
            case CONVERTER -> "CONVERTER_PROCESS_OPTS";
            case LOADER -> "LOADER_PROCESS_OPTS";
+           case PING -> "PING_PROCESS_OPTS";
+           case NDP -> "NDP_PROCESS_OPTS";
            case INDEX_CONSTRUCTOR -> "INDEX_CONSTRUCTION_PROCESS_OPTS";
            case EXPORT_TASKS -> "EXPORT_TASKS_PROCESS_OPTS";
        };

@@ -27,10 +27,12 @@ public class DbBrowseDomainsRandom {
    public List<BrowseResult> getRandomDomains(int count, DomainBlacklist blacklist, int set) {

        final String q = """
-               SELECT DOMAIN_ID, DOMAIN_NAME, INDEXED
+               SELECT EC_RANDOM_DOMAINS.DOMAIN_ID, DOMAIN_NAME, INDEXED
                FROM EC_RANDOM_DOMAINS
                INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID
+               LEFT JOIN DOMAIN_AVAILABILITY_INFORMATION DAI ON DAI.DOMAIN_ID=EC_RANDOM_DOMAINS.DOMAIN_ID
                WHERE STATE<2
+               AND SERVER_AVAILABLE
                AND DOMAIN_SET=?
                AND DOMAIN_ALIAS IS NULL
                ORDER BY RAND()

@@ -11,6 +11,7 @@ import nu.marginalia.service.discovery.property.ServicePartition;
import nu.marginalia.service.module.ServiceConfiguration;

import javax.annotation.CheckReturnValue;
+import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.List;

@@ -59,6 +60,11 @@ public class FeedsClient {
                .forEachRemaining(rsp -> consumer.accept(rsp.getDomain(), new ArrayList<>(rsp.getUrlList())));
    }

+   public boolean waitReady(Duration duration) throws InterruptedException {
+       return channelPool.awaitChannel(duration);
+   }

    /** Get the hash of the feed data, for identifying when the data has been updated */
    public String getFeedDataHash() {
        return channelPool.call(FeedApiGrpc.FeedApiBlockingStub::getFeedDataHash)

@@ -22,18 +22,20 @@ dependencies {
    implementation project(':code:common:db')
    implementation project(':code:libraries:blocking-thread-pool')
    implementation project(':code:libraries:message-queue')
    implementation project(':code:libraries:domain-lock')

    implementation project(':code:execution:api')
    implementation project(':code:processes:crawling-process:ft-content-type')
    implementation project(':third-party:rssreader')

    implementation libs.jsoup
    implementation project(':third-party:rssreader')
    implementation libs.opencsv
    implementation libs.slop
    implementation libs.sqlite
    implementation libs.bundles.slf4j
    implementation libs.commons.lang3
    implementation libs.commons.io
    implementation libs.httpclient
    implementation libs.wiremock

    implementation libs.prometheus

@@ -57,8 +59,6 @@ dependencies {
    implementation libs.bundles.gson
    implementation libs.bundles.mariadb

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito

@@ -0,0 +1,126 @@
package nu.marginalia.domsample;

import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import jakarta.inject.Named;
import nu.marginalia.domsample.db.DomSampleDb;
import nu.marginalia.livecapture.BrowserlessClient;
import nu.marginalia.service.module.ServiceConfiguration;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.net.URI;
import java.net.URISyntaxException;
import java.time.Duration;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.TimeUnit;

public class DomSampleService {
private final DomSampleDb db;
private final HikariDataSource mariadbDataSource;
private final URI browserlessURI;

private static final Logger logger = LoggerFactory.getLogger(DomSampleService.class);

@Inject
public DomSampleService(DomSampleDb db,
HikariDataSource mariadbDataSource,
@Named("browserless-uri") String browserlessAddress,
ServiceConfiguration serviceConfiguration)
throws URISyntaxException
{
this.db = db;
this.mariadbDataSource = mariadbDataSource;

if (StringUtils.isEmpty(browserlessAddress) || serviceConfiguration.node() > 1) {
logger.warn("Live capture service will not run");
browserlessURI = null;
}
else {
browserlessURI = new URI(browserlessAddress);
}
}

public void start() {
if (browserlessURI == null) {
logger.warn("DomSampleService is not enabled due to missing browserless URI or multi-node configuration");
return;
}

Thread.ofPlatform().daemon().start(this::run);
}

public void syncDomains() {
Set<String> dbDomains = new HashSet<>();

logger.info("Fetching domains from database...");

try (var conn = mariadbDataSource.getConnection();
var stmt = conn.prepareStatement("""
SELECT DOMAIN_NAME
FROM EC_DOMAIN
WHERE NODE_AFFINITY>0
""")
) {
var rs = stmt.executeQuery();
while (rs.next()) {
dbDomains.add(rs.getString("DOMAIN_NAME"));
}
} catch (Exception e) {
throw new RuntimeException("Failed to sync domains", e);
}

logger.info("Found {} domains in database", dbDomains.size());

db.syncDomains(dbDomains);

logger.info("Synced domains to sqlite");
}

public void run() {

try (var client = new BrowserlessClient(browserlessURI)) {

while (!Thread.currentThread().isInterrupted()) {

try {
// Grace sleep in case we're operating on an empty domain list
TimeUnit.SECONDS.sleep(15);

syncDomains();
var domains = db.getScheduledDomains();

for (var domain : domains) {
updateDomain(client, domain);
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
logger.info("DomSampleService interrupted, stopping...");
return;
} catch (Exception e) {
logger.error("Error in DomSampleService run loop", e);
}
}

}
}

private void updateDomain(BrowserlessClient client, String domain) {
var rootUrl = "https://" + domain + "/";
try {
var content = client.annotatedContent(rootUrl, new BrowserlessClient.GotoOptions("load", Duration.ofSeconds(10).toMillis()));

if (content.isPresent()) {
db.saveSample(domain, rootUrl, content.get());
}
} catch (Exception e) {
logger.error("Failed to process domain: " + domain, e);
}
finally {
db.flagDomainAsFetched(domain);
}
}

}
@@ -0,0 +1,174 @@
package nu.marginalia.domsample.db;

import nu.marginalia.WmsaHome;
import org.jsoup.Jsoup;

import java.nio.file.Path;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.*;

public class DomSampleDb implements AutoCloseable {
private static final String dbFileName = "dom-sample.db";
private final Connection connection;

public DomSampleDb() throws SQLException{
this(WmsaHome.getDataPath().resolve(dbFileName));
}

public DomSampleDb(Path dbPath) throws SQLException {
String dbUrl = "jdbc:sqlite:" + dbPath.toAbsolutePath();

connection = DriverManager.getConnection(dbUrl);

try (var stmt = connection.createStatement()) {
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS samples (url TEXT PRIMARY KEY, domain TEXT, sample BLOB, requests BLOB, accepted_popover BOOLEAN DEFAULT FALSE)");
stmt.executeUpdate("CREATE INDEX IF NOT EXISTS domain_index ON samples (domain)");
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS schedule (domain TEXT PRIMARY KEY, last_fetch TIMESTAMP DEFAULT NULL)");
stmt.execute("PRAGMA journal_mode=WAL");
}

}

public void syncDomains(Set<String> domains) {
Set<String> currentDomains = new HashSet<>();
try (var stmt = connection.prepareStatement("SELECT domain FROM schedule")) {
var rs = stmt.executeQuery();
while (rs.next()) {
currentDomains.add(rs.getString("domain"));
}
} catch (SQLException e) {
throw new RuntimeException("Failed to sync domains", e);
}

Set<String> toRemove = new HashSet<>(currentDomains);
Set<String> toAdd = new HashSet<>(domains);

toRemove.removeAll(domains);
toAdd.removeAll(currentDomains);

try (var removeStmt = connection.prepareStatement("DELETE FROM schedule WHERE domain = ?");
var addStmt = connection.prepareStatement("INSERT OR IGNORE INTO schedule (domain) VALUES (?)")
) {
for (String domain : toRemove) {
removeStmt.setString(1, domain);
removeStmt.executeUpdate();
}

for (String domain : toAdd) {
addStmt.setString(1, domain);
addStmt.executeUpdate();
}
} catch (SQLException e) {
throw new RuntimeException("Failed to remove domains", e);
}
}

public List<String> getScheduledDomains() {
List<String> domains = new ArrayList<>();
try (var stmt = connection.prepareStatement("SELECT domain FROM schedule ORDER BY last_fetch IS NULL DESC, last_fetch ASC")) {
var rs = stmt.executeQuery();
while (rs.next()) {
domains.add(rs.getString("domain"));
}
} catch (SQLException e) {
throw new RuntimeException("Failed to get scheduled domains", e);
}
return domains;
}

public void flagDomainAsFetched(String domain) {
try (var stmt = connection.prepareStatement("INSERT OR REPLACE INTO schedule (domain, last_fetch) VALUES (?, CURRENT_TIMESTAMP)")) {
stmt.setString(1, domain);
stmt.executeUpdate();
} catch (SQLException e) {
throw new RuntimeException("Failed to flag domain as fetched", e);
}
}

public record Sample(String url, String domain, String sample, String requests, boolean acceptedPopover) {}

public List<Sample> getSamples(String domain) throws SQLException {
List<Sample> samples = new ArrayList<>();

try (var stmt = connection.prepareStatement("""
SELECT url, sample, requests, accepted_popover
FROM samples
WHERE domain = ?
"""))
{
stmt.setString(1, domain);
var rs = stmt.executeQuery();
while (rs.next()) {
samples.add(
new Sample(
rs.getString("url"),
domain,
rs.getString("sample"),
rs.getString("requests"),
rs.getBoolean("accepted_popover")
)
);
}
}
return samples;
}

public void saveSample(String domain, String url, String rawContent) throws SQLException {
var doc = Jsoup.parse(rawContent);

var networkRequests = doc.getElementById("marginalia-network-requests");

boolean acceptedPopover = false;

StringBuilder requestTsv = new StringBuilder();
if (networkRequests != null) {

acceptedPopover = !networkRequests.getElementsByClass("marginalia-agreed-cookies").isEmpty();

for (var request : networkRequests.getElementsByClass("network-request")) {
String method = request.attr("data-method");
String urlAttr = request.attr("data-url");
String timestamp = request.attr("data-timestamp");

requestTsv
.append(method)
.append('\t')
.append(timestamp)
.append('\t')
.append(urlAttr.replace('\n', ' '))
.append("\n");
}

networkRequests.remove();
}

doc.body().removeAttr("id");

String sample = doc.html();

saveSampleRaw(domain, url, sample, requestTsv.toString().trim(), acceptedPopover);

}

public void saveSampleRaw(String domain, String url, String sample, String requests, boolean acceptedPopover) throws SQLException {
try (var stmt = connection.prepareStatement("""
INSERT OR REPLACE
INTO samples (domain, url, sample, requests, accepted_popover)
VALUES (?, ?, ?, ?, ?)
""")) {
stmt.setString(1, domain);
stmt.setString(2, url);
stmt.setString(3, sample);
stmt.setString(4, requests);
stmt.setBoolean(5, acceptedPopover);
stmt.executeUpdate();
}
}

public void close() throws SQLException {
connection.close();
}
}
@@ -8,10 +8,13 @@ import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URI;
import java.net.URLEncoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.List;
import java.util.Map;
import java.util.Optional;

@@ -60,6 +63,42 @@ public class BrowserlessClient implements AutoCloseable {
return Optional.of(rsp.body());
}

/** Fetches content with a marginalia hack extension loaded that decorates the DOM with attributes for
* certain CSS attributes, to be able to easier identify popovers and other nuisance elements.
*/
public Optional<String> annotatedContent(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
Map<String, Object> requestData = Map.of(
"url", url,
"userAgent", userAgent,
"gotoOptions", gotoOptions,
"waitForSelector", Map.of("selector", "#marginaliahack", "timeout", 15000)
);

// Launch parameters for the browserless instance to load the extension
Map<String, Object> launchParameters = Map.of(
"args", List.of("--load-extension=/dom-export")
);

String launchParametersStr = URLEncoder.encode(gson.toJson(launchParameters), StandardCharsets.UTF_8);

var request = HttpRequest.newBuilder()
.uri(browserlessURI.resolve("/content?token="+BROWSERLESS_TOKEN+"&launch="+launchParametersStr))
.method("POST", HttpRequest.BodyPublishers.ofString(
gson.toJson(requestData)
))
.header("Content-type", "application/json")
.build();

var rsp = httpClient.send(request, HttpResponse.BodyHandlers.ofString());

if (rsp.statusCode() >= 300) {
logger.info("Failed to fetch annotated content for {}, status {}", url, rsp.statusCode());
return Optional.empty();
}

return Optional.of(rsp.body());
}

public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions)
throws IOException, InterruptedException {

@@ -126,7 +126,6 @@ public class LiveCaptureGrpcService
}
else {
EdgeDomain domain = domainNameOpt.get();
String domainNameStr = domain.toString();

if (!isValidDomainForCapture(domain)) {
ScreenshotDbOperations.flagDomainAsFetched(conn, domain);
@@ -1,66 +0,0 @@
package nu.marginalia.rss.svc;

import nu.marginalia.model.EdgeDomain;

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore;

/** Holds lock objects for each domain, to prevent multiple threads from
* crawling the same domain at the same time.
*/
public class DomainLocks {
// The locks are stored in a map, with the domain name as the key. This map will grow
// relatively big, but should be manageable since the number of domains is limited to
// a few hundred thousand typically.
private final Map<String, Semaphore> locks = new ConcurrentHashMap<>();

/** Returns a lock object corresponding to the given domain. The object is returned as-is,
* and may be held by another thread. The caller is responsible for locking and releasing the lock.
*/
public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
return new DomainLock(domain.toString(),
locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits));
}

private Semaphore defaultPermits(String topDomain) {
if (topDomain.equals("wordpress.com"))
return new Semaphore(16);
if (topDomain.equals("blogspot.com"))
return new Semaphore(8);

if (topDomain.equals("neocities.org"))
return new Semaphore(4);
if (topDomain.equals("github.io"))
return new Semaphore(4);

if (topDomain.equals("substack.com")) {
return new Semaphore(1);
}
if (topDomain.endsWith(".edu")) {
return new Semaphore(1);
}

return new Semaphore(2);
}

public static class DomainLock implements AutoCloseable {
private final String domainName;
private final Semaphore semaphore;

DomainLock(String domainName, Semaphore semaphore) throws InterruptedException {
this.domainName = domainName;
this.semaphore = semaphore;

Thread.currentThread().setName("fetching:" + domainName + " [await domain lock]");
semaphore.acquire();
Thread.currentThread().setName("fetching:" + domainName);
}

@Override
public void close() {
semaphore.release();
Thread.currentThread().setName("fetching:" + domainName + " [wrapping up]");
}
}
}
@@ -5,6 +5,8 @@ import com.opencsv.CSVReader;
import nu.marginalia.WmsaHome;
import nu.marginalia.contenttype.ContentType;
import nu.marginalia.contenttype.DocumentBodyToString;
import nu.marginalia.coordination.DomainCoordinator;
import nu.marginalia.coordination.DomainLock;
import nu.marginalia.executor.client.ExecutorClient;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.nodecfg.NodeConfigurationService;
@@ -18,19 +20,36 @@ import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;
import nu.marginalia.storage.model.FileStorageType;
import nu.marginalia.util.SimpleBlockingThreadPool;
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.config.ConnectionConfig;
import org.apache.hc.client5.http.config.RequestConfig;
import org.apache.hc.client5.http.cookie.StandardCookieSpec;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder;
import org.apache.hc.core5.http.Header;
import org.apache.hc.core5.http.HeaderElement;
import org.apache.hc.core5.http.HeaderElements;
import org.apache.hc.core5.http.HttpResponse;
import org.apache.hc.core5.http.io.SocketConfig;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
import org.apache.hc.core5.http.message.MessageSupport;
import org.apache.hc.core5.http.protocol.HttpContext;
import org.apache.hc.core5.util.TimeValue;
import org.apache.hc.core5.util.Timeout;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.sql.SQLException;
import java.time.*;
import java.time.Instant;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.concurrent.ExecutorService;
@@ -51,12 +70,15 @@ public class FeedFetcherService {
private final ServiceHeartbeat serviceHeartbeat;
private final ExecutorClient executorClient;

private final DomainLocks domainLocks = new DomainLocks();
private final DomainCoordinator domainCoordinator;

private final HttpClient httpClient;

private volatile boolean updating;

@Inject
public FeedFetcherService(FeedDb feedDb,
DomainCoordinator domainCoordinator,
FileStorageService fileStorageService,
NodeConfigurationService nodeConfigurationService,
ServiceHeartbeat serviceHeartbeat,
@@ -67,6 +89,84 @@ public class FeedFetcherService {
this.nodeConfigurationService = nodeConfigurationService;
this.serviceHeartbeat = serviceHeartbeat;
this.executorClient = executorClient;
this.domainCoordinator = domainCoordinator;

final ConnectionConfig connectionConfig = ConnectionConfig.custom()
.setSocketTimeout(15, TimeUnit.SECONDS)
.setConnectTimeout(15, TimeUnit.SECONDS)
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
.build();

var connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
.setMaxConnPerRoute(2)
.setMaxConnTotal(50)
.setDefaultConnectionConfig(connectionConfig)
.build();

connectionManager.setDefaultSocketConfig(SocketConfig.custom()
.setSoLinger(TimeValue.ofSeconds(-1))
.setSoTimeout(Timeout.ofSeconds(10))
.build()
);

Thread.ofPlatform().daemon(true).start(() -> {
try {
for (;;) {
TimeUnit.SECONDS.sleep(15);
logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
}
}
catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
});

final RequestConfig defaultRequestConfig = RequestConfig.custom()
.setCookieSpec(StandardCookieSpec.IGNORE)
.setResponseTimeout(10, TimeUnit.SECONDS)
.setConnectionRequestTimeout(5, TimeUnit.MINUTES)
.build();

httpClient = HttpClients.custom()
.setDefaultRequestConfig(defaultRequestConfig)
.setConnectionManager(connectionManager)
.setUserAgent(WmsaHome.getUserAgent().uaIdentifier())
.setConnectionManager(connectionManager)
.setKeepAliveStrategy(new ConnectionKeepAliveStrategy() {
// Default keep-alive duration is 3 minutes, but this is too long for us,
// as we are either going to re-use it fairly quickly or close it for a long time.
//
// So we set it to 30 seconds or clamp the server-provided value to a minimum of 10 seconds.
private static final TimeValue defaultValue = TimeValue.ofSeconds(30);

@Override
public TimeValue getKeepAliveDuration(HttpResponse response, HttpContext context) {
final Iterator<HeaderElement> it = MessageSupport.iterate(response, HeaderElements.KEEP_ALIVE);

while (it.hasNext()) {
final HeaderElement he = it.next();
final String param = he.getName();
final String value = he.getValue();

if (value == null)
continue;
if (!"timeout".equalsIgnoreCase(param))
continue;

try {
long timeout = Long.parseLong(value);
timeout = Math.clamp(timeout, 30, defaultValue.toSeconds());
return TimeValue.ofSeconds(timeout);
} catch (final NumberFormatException ignore) {
break;
}
}
return defaultValue;
}
})
.build();

}

public enum UpdateMode {
@@ -82,13 +182,7 @@ public class FeedFetcherService {

try (FeedDbWriter writer = feedDb.createWriter();
HttpClient client = HttpClient.newBuilder()
.connectTimeout(Duration.ofSeconds(15))
.executor(Executors.newCachedThreadPool())
.followRedirects(HttpClient.Redirect.NORMAL)
.version(HttpClient.Version.HTTP_2)
.build();
ExecutorService fetchExecutor = Executors.newCachedThreadPool();
ExecutorService fetchExecutor = Executors.newVirtualThreadPerTaskExecutor();
FeedJournal feedJournal = FeedJournal.create();
var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Update Rss Feeds")
) {
@@ -132,8 +226,9 @@ public class FeedFetcherService {
};

FetchResult feedData;
try (DomainLocks.DomainLock domainLock = domainLocks.lockDomain(new EdgeDomain(feed.domain()))) {
feedData = fetchFeedData(feed, client, fetchExecutor, ifModifiedSinceDate, ifNoneMatchTag);
try (DomainLock domainLock = domainCoordinator.lockDomain(new EdgeDomain(feed.domain()))) {
feedData = fetchFeedData(feed, fetchExecutor, ifModifiedSinceDate, ifNoneMatchTag);
TimeUnit.SECONDS.sleep(1); // Sleep before we yield the lock to avoid hammering the server from multiple processes
} catch (Exception ex) {
feedData = new FetchResult.TransientError();
}
@@ -212,7 +307,6 @@ public class FeedFetcherService {
}

private FetchResult fetchFeedData(FeedDefinition feed,
HttpClient client,
ExecutorService executorService,
@Nullable String ifModifiedSinceDate,
@Nullable String ifNoneMatchTag)
@@ -220,59 +314,63 @@ public class FeedFetcherService {
try {
URI uri = new URI(feed.feedUrl());

HttpRequest.Builder requestBuilder = HttpRequest.newBuilder()
.GET()
.uri(uri)
.header("User-Agent", WmsaHome.getUserAgent().uaIdentifier())
.header("Accept-Encoding", "gzip")
.header("Accept", "text/*, */*;q=0.9")
.timeout(Duration.ofSeconds(15))
;
var requestBuilder = ClassicRequestBuilder.get(uri)
.setHeader("User-Agent", WmsaHome.getUserAgent().uaIdentifier())
.setHeader("Accept-Encoding", "gzip")
.setHeader("Accept", "text/*, */*;q=0.9");

// Set the If-Modified-Since or If-None-Match headers if we have them
// though since there are certain idiosyncrasies in server implementations,
// we avoid setting both at the same time as that may turn a 304 into a 200.
if (ifNoneMatchTag != null) {
requestBuilder.header("If-None-Match", ifNoneMatchTag);
requestBuilder.addHeader("If-None-Match", ifNoneMatchTag);
} else if (ifModifiedSinceDate != null) {
requestBuilder.header("If-Modified-Since", ifModifiedSinceDate);
requestBuilder.addHeader("If-Modified-Since", ifModifiedSinceDate);
}

return httpClient.execute(requestBuilder.build(), rsp -> {
try {
logger.info("Code: {}, URL: {}", rsp.getCode(), uri);

HttpRequest getRequest = requestBuilder.build();
switch (rsp.getCode()) {
case 200 -> {
if (rsp.getEntity() == null) {
return new FetchResult.TransientError(); // No content to read, treat as transient error
}
byte[] responseData = EntityUtils.toByteArray(rsp.getEntity());

for (int i = 0; i < 3; i++) {
// Decode the response body based on the Content-Type header
Header contentTypeHeader = rsp.getFirstHeader("Content-Type");
if (contentTypeHeader == null) {
return new FetchResult.TransientError();
}
String contentType = contentTypeHeader.getValue();
String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), responseData);

/* Note we need to use an executor to time-limit the send() method in HttpClient, as
* its support for timeouts only applies to the time until response starts to be received,
* and does not catch the case when the server starts to send data but then hangs.
*/
HttpResponse<byte[]> rs = executorService.submit(
() -> client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray()))
.get(15, TimeUnit.SECONDS);
// Grab the ETag header if it exists
Header etagHeader = rsp.getFirstHeader("ETag");
String newEtagValue = etagHeader == null ? null : etagHeader.getValue();

if (rs.statusCode() == 429) { // Too Many Requests
int retryAfter = Integer.parseInt(rs.headers().firstValue("Retry-After").orElse("2"));
Thread.sleep(Duration.ofSeconds(Math.clamp(retryAfter, 1, 5)));
continue;
}

String newEtagValue = rs.headers().firstValue("ETag").orElse("");

return switch (rs.statusCode()) {
case 200 -> {
byte[] responseData = getResponseData(rs);

String contentType = rs.headers().firstValue("Content-Type").orElse("");
String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), responseData);

yield new FetchResult.Success(bodyText, newEtagValue);
return new FetchResult.Success(bodyText, newEtagValue);
}
case 304 -> {
return new FetchResult.NotModified(); // via If-Modified-Since semantics
}
case 404 -> {
return new FetchResult.PermanentError(); // never try again
}
default -> {
return new FetchResult.TransientError(); // we try again later
}
}
case 304 -> new FetchResult.NotModified(); // via If-Modified-Since semantics
case 404 -> new FetchResult.PermanentError(); // never try again
default -> new FetchResult.TransientError(); // we try again later
};
}
}
catch (Exception ex) {
return new FetchResult.PermanentError(); // treat as permanent error
}
finally {
EntityUtils.consumeQuietly(rsp.getEntity());
}
});
}
catch (Exception ex) {
logger.debug("Error fetching feed", ex);
@@ -281,19 +379,6 @@ public class FeedFetcherService {
return new FetchResult.TransientError();
}

private byte[] getResponseData(HttpResponse<byte[]> response) throws IOException {
String encoding = response.headers().firstValue("Content-Encoding").orElse("");

if ("gzip".equals(encoding)) {
try (var stream = new GZIPInputStream(new ByteArrayInputStream(response.body()))) {
return stream.readAllBytes();
}
}
else {
return response.body();
}
}

public sealed interface FetchResult {
record Success(String value, String etag) implements FetchResult {}
record NotModified() implements FetchResult {}
@@ -0,0 +1,113 @@
package nu.marginalia.domsample.db;

import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.testcontainers.shaded.org.apache.commons.io.FileUtils;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;

import static org.junit.jupiter.api.Assertions.*;

class DomSampleDbTest {
Path tempDir;

@BeforeEach
void setUp() throws Exception {
tempDir = Files.createTempDirectory("test");
}

@AfterEach
void tearDown() throws IOException {
FileUtils.deleteDirectory(tempDir.toFile());
}

@Test
public void testSetUp() {
var dbPath = tempDir.resolve("test.db");
try (var db = new DomSampleDb(dbPath)) {
}
catch (Exception e) {
fail("Failed to set up database: " + e.getMessage());
}
}

@Test
public void testSyncDomains() {
var dbPath = tempDir.resolve("test.db");
try (var db = new DomSampleDb(dbPath)) {

db.syncDomains(Set.of("example.com", "test.com", "foobar.com"));
assertEquals(Set.of("example.com", "test.com", "foobar.com"), new HashSet<>(db.getScheduledDomains()));
db.syncDomains(Set.of("example.com", "test.com"));
assertEquals(Set.of("example.com", "test.com"), new HashSet<>(db.getScheduledDomains()));
db.syncDomains(Set.of("foobar.com", "test.com"));
assertEquals(Set.of("foobar.com", "test.com"), new HashSet<>(db.getScheduledDomains()));
}
catch (Exception e) {
fail("Failed to sync domains: " + e.getMessage());
}
}

@Test
public void testFetchDomains() {
var dbPath = tempDir.resolve("test.db");
try (var db = new DomSampleDb(dbPath)) {

db.syncDomains(Set.of("example.com", "test.com", "foobar.com"));
db.flagDomainAsFetched("example.com");
db.flagDomainAsFetched("test.com");
db.flagDomainAsFetched("foobar.com");
assertEquals(List.of("example.com", "test.com", "foobar.com"), db.getScheduledDomains());
db.flagDomainAsFetched("test.com");
assertEquals(List.of("example.com", "foobar.com", "test.com"), db.getScheduledDomains());
}
catch (Exception e) {
fail("Failed to sync domains: " + e.getMessage());
}
}

@Test
public void saveLoadSingle() {
var dbPath = tempDir.resolve("test.db");
try (var db = new DomSampleDb(dbPath)) {
db.saveSampleRaw("example.com", "http://example.com/sample", "sample data", "requests data", true);
var samples = db.getSamples("example.com");
assertEquals(1, samples.size());
var sample = samples.getFirst();
assertEquals("example.com", sample.domain());
assertEquals("http://example.com/sample", sample.url());
assertEquals("sample data", sample.sample());
assertEquals("requests data", sample.requests());
assertTrue(sample.acceptedPopover());
}
catch (Exception e) {
fail("Failed to save/load sample: " + e.getMessage());
}
}

@Test
public void saveLoadTwo() {
var dbPath = tempDir.resolve("test.db");
try (var db = new DomSampleDb(dbPath)) {
db.saveSampleRaw("example.com", "http://example.com/sample", "sample data", "r1", true);
db.saveSampleRaw("example.com", "http://example.com/sample2", "sample data2", "r2", false);
var samples = db.getSamples("example.com");
assertEquals(2, samples.size());

Map<String, String> samplesByUrl = new HashMap<>();
for (var sample : samples) {
samplesByUrl.put(sample.url(), sample.sample());
}

assertEquals("sample data", samplesByUrl.get("http://example.com/sample"));
assertEquals("sample data2", samplesByUrl.get("http://example.com/sample2"));
}
catch (Exception e) {
fail("Failed to save/load sample: " + e.getMessage());
}
}
}
@@ -3,17 +3,21 @@ package nu.marginalia.livecapture;
import com.github.tomakehurst.wiremock.WireMockServer;
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
import nu.marginalia.WmsaHome;
import nu.marginalia.domsample.db.DomSampleDb;
import nu.marginalia.service.module.ServiceConfigurationModule;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.testcontainers.containers.GenericContainer;
import org.testcontainers.images.PullPolicy;
import org.testcontainers.junit.jupiter.Testcontainers;
import org.testcontainers.utility.DockerImageName;

import java.io.IOException;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Map;

import static com.github.tomakehurst.wiremock.client.WireMock.*;
@@ -22,9 +26,14 @@ import static com.github.tomakehurst.wiremock.client.WireMock.*;
@Testcontainers
@Tag("slow")
public class BrowserlessClientTest {
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
// Run gradle docker if this image is not available
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("marginalia-browserless"))
.withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
.withImagePullPolicy(PullPolicy.defaultPolicy())
.withNetworkMode("bridge")
.withLogConsumer(frame -> {
System.out.print(frame.getUtf8String());
})
.withExposedPorts(3000);

static WireMockServer wireMockServer =
@@ -34,6 +43,7 @@ public class BrowserlessClientTest {
static String localIp;

static URI browserlessURI;
static URI browserlessWssURI;

@BeforeAll
public static void setup() throws IOException {
@@ -44,6 +54,12 @@ public class BrowserlessClientTest {
container.getMappedPort(3000))
);

browserlessWssURI = URI.create(String.format("ws://%s:%d/?token=BROWSERLESS_TOKEN",
container.getHost(),
container.getMappedPort(3000))
);

wireMockServer.start();
wireMockServer.stubFor(get("/").willReturn(aResponse().withStatus(200).withBody("Ok")));

@@ -85,6 +101,30 @@ public class BrowserlessClientTest {
}
}

@Test
public void testAnnotatedContent() throws Exception {

try (var client = new BrowserlessClient(browserlessURI);
DomSampleDb dbop = new DomSampleDb(Path.of("/tmp/dom-sample.db"))
) {
var content = client.annotatedContent("https://marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()).orElseThrow();
dbop.saveSample("marginalia.nu", "https://marginalia.nu/", content);
System.out.println(content);
Assertions.assertFalse(content.isBlank(), "Content should not be empty");

dbop.getSamples("marginalia.nu").forEach(sample -> {
System.out.println("Sample URL: " + sample.url());
System.out.println("Sample Content: " + sample.sample());
System.out.println("Sample Requests: " + sample.requests());
System.out.println("Accepted Popover: " + sample.acceptedPopover());
});
}
finally {
Files.deleteIfExists(Path.of("/tmp/dom-sample.db"));
}

}

@Test
public void testScreenshot() throws Exception {
try (var client = new BrowserlessClient(browserlessURI)) {
@@ -5,6 +5,8 @@ import com.google.inject.Guice;
import com.google.inject.name.Names;
import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.coordination.DomainCoordinator;
import nu.marginalia.coordination.LocalDomainCoordinator;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.rss.db.FeedDb;
import nu.marginalia.rss.model.FeedItems;
@@ -82,6 +84,7 @@ class FeedFetcherServiceTest extends AbstractModule {
}

public void configure() {
bind(DomainCoordinator.class).to(LocalDomainCoordinator.class);
bind(HikariDataSource.class).toInstance(dataSource);
bind(ServiceRegistryIf.class).toInstance(Mockito.mock(ServiceRegistryIf.class));
bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(ServiceId.Executor, 1, "", "", 0, UUID.randomUUID()));
43
code/functions/nsfw-domain-filter/build.gradle
Normal file
@@ -0,0 +1,43 @@
plugins {
id 'java'
id 'jvm-test-suite'
}

java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}

apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {

implementation project(':code:common:config')
implementation project(':code:common:model')
implementation project(':code:common:db')

implementation libs.bundles.slf4j
implementation libs.prometheus
implementation libs.guava
implementation libs.commons.lang3
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}
implementation libs.notnull
implementation libs.fastutil
implementation libs.bundles.mariadb

testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito

testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
testImplementation libs.commons.codec
testImplementation project(':code:common:service')
testImplementation 'org.testcontainers:mariadb:1.17.4'
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
testImplementation project(':code:libraries:test-helpers')
}
@@ -0,0 +1,192 @@
package nu.marginalia.nsfw;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import com.zaxxer.hikari.HikariDataSource;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.InputStreamReader;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.zip.GZIPInputStream;

@Singleton
public class NsfwDomainFilter {
private final HikariDataSource dataSource;

private final List<String> dangerLists;
private final List<String> smutLists;

private volatile IntOpenHashSet blockedDomainIdsTier1 = new IntOpenHashSet();
private volatile IntOpenHashSet blockedDomainIdsTier2 = new IntOpenHashSet();

private static final Logger logger = LoggerFactory.getLogger(NsfwDomainFilter.class);

public static final int NSFW_DISABLE = 0;
public static final int NSFW_BLOCK_DANGER = 1;
public static final int NSFW_BLOCK_SMUT = 2;

@Inject
public NsfwDomainFilter(HikariDataSource dataSource,
@Named("nsfw.dangerLists") List<String> dangerLists,
@Named("nsfw.smutLists") List<String> smutLists
) {
this.dataSource = dataSource;

this.dangerLists = dangerLists;
this.smutLists = smutLists;

Thread.ofPlatform().daemon().name("NsfwDomainFilterSync").start(() -> {
while (true) {
sync();
try {
TimeUnit.HOURS.sleep(1);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
break; // Exit the loop if interrupted
}
}
});
}

public boolean isBlocked(int domainId, int tier) {
if (tier == 0)
return false;

if (tier >= 1 && blockedDomainIdsTier1.contains(domainId))
return true;
if (tier >= 2 && blockedDomainIdsTier2.contains(domainId))
return true;

return false;
}

private synchronized void sync() {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("SELECT ID, TIER FROM NSFW_DOMAINS")
) {
var rs = stmt.executeQuery();
IntOpenHashSet tier1 = new IntOpenHashSet();
IntOpenHashSet tier2 = new IntOpenHashSet();

while (rs.next()) {
int domainId = rs.getInt("ID");
int tier = rs.getInt("TIER");

switch (tier) {
case 1 -> tier1.add(domainId);
case 2 -> tier2.add(domainId);
}
}

this.blockedDomainIdsTier1 = tier1;
this.blockedDomainIdsTier2 = tier2;

logger.info("NSFW domain filter synced: {} tier 1, {} tier 2", tier1.size(), tier2.size());

}
catch (SQLException ex) {
logger.error("Failed to sync NSFW domain filter", ex);
}
}

public synchronized void fetchLists() {
try (var conn = dataSource.getConnection();
HttpClient client = HttpClient.newBuilder()
.followRedirects(HttpClient.Redirect.ALWAYS)
.build();
var stmt = conn.createStatement();
var insertStmt = conn.prepareStatement("INSERT IGNORE INTO NSFW_DOMAINS_TMP (ID, TIER) SELECT ID, ? FROM EC_DOMAIN WHERE DOMAIN_NAME = ?")) {

stmt.execute("DROP TABLE IF EXISTS NSFW_DOMAINS_TMP");
stmt.execute("CREATE TABLE NSFW_DOMAINS_TMP LIKE NSFW_DOMAINS");

List<String> combinedDangerList = new ArrayList<>(10_000);
for (var dangerListUrl : dangerLists) {
combinedDangerList.addAll(fetchList(client, dangerListUrl));
}

for (String domain : combinedDangerList) {
insertStmt.setInt(1, NSFW_BLOCK_DANGER);
insertStmt.setString(2, domain);
insertStmt.execute();
}

List<String> combinedSmutList = new ArrayList<>(10_000);
for (var smutListUrl : smutLists) {
combinedSmutList.addAll(fetchList(client, smutListUrl));
}

for (String domain : combinedSmutList) {
insertStmt.setInt(1, NSFW_BLOCK_SMUT);
insertStmt.setString(2, domain);
insertStmt.addBatch();
insertStmt.execute();
}

stmt.execute("""
DROP TABLE IF EXISTS NSFW_DOMAINS
""");
stmt.execute("""
RENAME TABLE NSFW_DOMAINS_TMP TO NSFW_DOMAINS
""");
sync();
}
catch (SQLException ex) {
logger.error("Failed to fetch NSFW domain lists", ex);
}
}

public List<String> fetchList(HttpClient client, String url) {

logger.info("Fetching NSFW domain list from {}", url);

var request = HttpRequest.newBuilder()
.uri(java.net.URI.create(url))
.build();

try {
if (url.endsWith(".gz")) {
var response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());

byte[] body = response.body();

try (var reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(new ByteArrayInputStream(body))))) {
return reader.lines()
.filter(StringUtils::isNotEmpty)
.toList();
} catch (Exception e) {
logger.error("Error reading GZIP response from {}", url, e);
}
} else {
var response = client.send(request, HttpResponse.BodyHandlers.ofString());
if (response.statusCode() == 200) {

return Arrays.stream(StringUtils.split(response.body(), "\n"))
.filter(StringUtils::isNotEmpty)
.toList();
} else {
logger.warn("Failed to fetch list from {}: HTTP {}", url, response.statusCode());
}
}
}
catch (Exception e) {
logger.error("Error fetching NSFW domain list from {}", url, e);
}

return List.of();
}
}
@@ -0,0 +1,30 @@
package nu.marginalia.nsfw;

import com.google.inject.AbstractModule;
import com.google.inject.Provides;
import jakarta.inject.Named;

import java.util.List;

public class NsfwFilterModule extends AbstractModule {

@Provides
@Named("nsfw.dangerLists")
public List<String> nsfwDomainLists1() {
return List.of(
"https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/cryptojacking/domains",
"https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/malware/domains",
"https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/phishing/domains"
);
}
@Provides
@Named("nsfw.smutLists")
public List<String> nsfwDomainLists2() {
return List.of(
"https://github.com/olbat/ut1-blacklists/raw/refs/heads/master/blacklists/adult/domains.gz",
"https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/gambling/domains"
);
}

public void configure() {}
}
@@ -0,0 +1,108 @@
package nu.marginalia.nsfw;

import com.google.inject.AbstractModule;
import com.google.inject.Guice;
import com.google.inject.Provides;
import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import jakarta.inject.Named;
import nu.marginalia.test.TestMigrationLoader;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.testcontainers.containers.MariaDBContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;

import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

@Tag("slow")
@Testcontainers
class NsfwDomainFilterTest extends AbstractModule {

@Container
static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
.withDatabaseName("WMSA_prod")
.withUsername("wmsa")
.withPassword("wmsa")
.withNetworkAliases("mariadb");

static HikariDataSource dataSource;
static Path tempDir;

@BeforeAll
public static void setUpDb() throws IOException {
tempDir = Files.createTempDirectory(NsfwDomainFilterTest.class.getSimpleName());

System.setProperty("system.homePath", tempDir.toString());

HikariConfig config = new HikariConfig();
config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
config.setUsername("wmsa");
config.setPassword("wmsa");

dataSource = new HikariDataSource(config);

TestMigrationLoader.flywayMigration(dataSource);

try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES (?, ?, 1)")
) {

// Ensure the database is ready
conn.createStatement().execute("SELECT 1");

stmt.setString(1, "www.google.com");
stmt.setString(2, "google.com");
stmt.executeUpdate();
stmt.setString(1, "www.bing.com");
stmt.setString(2, "bing.com");
stmt.executeUpdate();
} catch (Exception e) {
throw new RuntimeException("Failed to connect to the database", e);
}
}

@Provides
@Named("nsfw.dangerLists")
public List<String> nsfwDomainLists1() {
return List.of(
"https://downloads.marginalia.nu/test/list1"
);
}

@Provides
@Named("nsfw.smutLists")
public List<String> nsfwDomainLists2() {
return List.of(
"https://downloads.marginalia.nu/test/list2.gz"
);
}

public void configure() {
bind(HikariDataSource.class).toInstance(dataSource);
}

@Test
public void test() {
var filter = Guice
.createInjector(this)
.getInstance(NsfwDomainFilter.class);

filter.fetchLists();

assertTrue(filter.isBlocked(1, NsfwDomainFilter.NSFW_BLOCK_DANGER));
assertTrue(filter.isBlocked(1, NsfwDomainFilter.NSFW_BLOCK_SMUT));
assertFalse(filter.isBlocked(2, NsfwDomainFilter.NSFW_BLOCK_DANGER));
assertTrue(filter.isBlocked(2, NsfwDomainFilter.NSFW_BLOCK_SMUT));
}

}
@@ -1,9 +1,6 @@
package nu.marginalia.api.searchquery;

import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.QueryResponse;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.*;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
@@ -32,6 +29,8 @@ public class QueryProtobufCodec {
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
builder.setHumanQuery(request.getHumanQuery());

builder.setNsfwFilterTierValue(request.getNsfwFilterTierValue());

builder.setQuality(IndexProtobufCodec.convertSpecLimit(query.specs.quality));
builder.setYear(IndexProtobufCodec.convertSpecLimit(query.specs.year));
builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size));
@@ -78,6 +77,8 @@ public class QueryProtobufCodec {
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
builder.setHumanQuery(humanQuery);

builder.setNsfwFilterTier(RpcIndexQuery.NSFW_FILTER_TIER.DANGER);

builder.setQuality(IndexProtobufCodec.convertSpecLimit(query.specs.quality));
builder.setYear(IndexProtobufCodec.convertSpecLimit(query.specs.year));
builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size));
@@ -112,6 +113,7 @@ public class QueryProtobufCodec {
request.getSearchSetIdentifier(),
QueryStrategy.valueOf(request.getQueryStrategy()),
RpcTemporalBias.Bias.valueOf(request.getTemporalBias().getBias().name()),
NsfwFilterTier.fromCodedValue(request.getNsfwFilterTierValue()),
request.getPagination().getPage()
);
}
@@ -327,6 +329,7 @@ public class QueryProtobufCodec {
.setRank(IndexProtobufCodec.convertSpecLimit(params.rank()))
.setSearchSetIdentifier(params.identifier())
.setQueryStrategy(params.queryStrategy().name())
.setNsfwFilterTierValue(params.filterTier().getCodedValue())
.setTemporalBias(RpcTemporalBias.newBuilder()
.setBias(RpcTemporalBias.Bias.valueOf(params.temporalBias().name()))
.build())
@@ -0,0 +1,26 @@
package nu.marginalia.api.searchquery.model.query;

public enum NsfwFilterTier {
OFF(0),
DANGER(1),
PORN_AND_GAMBLING(2);

private final int codedValue; // same as ordinal() for now, but can be changed later if needed

NsfwFilterTier(int codedValue) {
this.codedValue = codedValue;
}

public static NsfwFilterTier fromCodedValue(int codedValue) {
for (NsfwFilterTier tier : NsfwFilterTier.values()) {
if (tier.codedValue == codedValue) {
return tier;
}
}
throw new IllegalArgumentException("Invalid coded value for NsfwFilterTirer: " + codedValue);
}

public int getCodedValue() {
return codedValue;
}
}
@@ -25,10 +25,11 @@ public record QueryParams(
String identifier,
QueryStrategy queryStrategy,
RpcTemporalBias.Bias temporalBias,
NsfwFilterTier filterTier,
int page
)
{
public QueryParams(String query, RpcQueryLimits limits, String identifier) {
public QueryParams(String query, RpcQueryLimits limits, String identifier, NsfwFilterTier filterTier) {
this(query, null,
List.of(),
List.of(),
@@ -43,6 +44,7 @@ public record QueryParams(
identifier,
QueryStrategy.AUTO,
RpcTemporalBias.Bias.NONE,
filterTier,
1 // page
);
}
@@ -32,6 +32,14 @@ message RpcQsQuery {
RpcTemporalBias temporalBias = 16;

RpcQsQueryPagination pagination = 17;

NSFW_FILTER_TIER nsfwFilterTier = 18;

enum NSFW_FILTER_TIER {
NONE = 0;
DANGER = 1;
PORN_AND_GAMBLING = 2;
};
}

/* Query service query response */
@@ -78,8 +86,17 @@ message RpcIndexQuery {
RpcQueryLimits queryLimits = 10;
string queryStrategy = 11; // Named query configuration
RpcResultRankingParameters parameters = 12;

NSFW_FILTER_TIER nsfwFilterTier = 13;

enum NSFW_FILTER_TIER {
NONE = 0;
DANGER = 1;
PORN_AND_GAMBLING = 2;
};
}

/* A tagged union encoding some limit on a field */
message RpcSpecLimit {
int32 value = 1;
@@ -19,6 +19,7 @@ dependencies {
implementation project(':code:common:model')
implementation project(':code:common:service')

implementation project(':code:functions:nsfw-domain-filter')
implementation project(':code:functions:search-query:api')

implementation project(':code:index:query')
@@ -11,6 +11,7 @@ import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
import nu.marginalia.index.api.IndexClient;
import nu.marginalia.nsfw.NsfwDomainFilter;
import nu.marginalia.service.server.DiscoverableService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -34,13 +35,16 @@ public class QueryGRPCService

private final QueryFactory queryFactory;
private final NsfwDomainFilter nsfwDomainFilter;
private final IndexClient indexClient;

@Inject
public QueryGRPCService(QueryFactory queryFactory,
NsfwDomainFilter nsfwDomainFilter,
IndexClient indexClient)
{
this.queryFactory = queryFactory;
this.nsfwDomainFilter = nsfwDomainFilter;
this.indexClient = indexClient;
}
@@ -3,6 +3,7 @@ package nu.marginalia.query.svc;
import nu.marginalia.WmsaHome;
import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.RpcTemporalBias;
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.functions.searchquery.QueryFactory;
@@ -58,6 +59,7 @@ public class QueryFactoryTest {
                "NONE",
                QueryStrategy.AUTO,
                RpcTemporalBias.Bias.NONE,
                NsfwFilterTier.OFF,
                0), null).specs;
    }

@@ -17,6 +17,7 @@ dependencies {
    implementation project(':code:common:service')
    implementation project(':code:common:db')
    implementation project(':code:libraries:message-queue')
    implementation project(':code:functions:nsfw-domain-filter')
    implementation project(':code:functions:search-query:api')

    implementation libs.bundles.slf4j
@@ -2,11 +2,13 @@ package nu.marginalia.index.api;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.prometheus.client.Counter;
import nu.marginalia.api.searchquery.IndexApiGrpc;
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
import nu.marginalia.api.searchquery.RpcIndexQuery;
import nu.marginalia.db.DomainBlacklistImpl;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.nsfw.NsfwDomainFilter;
import nu.marginalia.service.client.GrpcChannelPoolFactory;
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
import nu.marginalia.service.discovery.property.ServiceKey;
@@ -28,14 +30,26 @@ public class IndexClient {
    private static final Logger logger = LoggerFactory.getLogger(IndexClient.class);
    private final GrpcMultiNodeChannelPool<IndexApiGrpc.IndexApiBlockingStub> channelPool;
    private final DomainBlacklistImpl blacklist;
    private final NsfwDomainFilter nsfwDomainFilter;

    Counter wmsa_index_query_count = Counter.build()
            .name("wmsa_nsfw_filter_result_count")
            .labelNames("tier")
            .help("Count of results filtered by NSFW tier")
            .register();

    private static final ExecutorService executor = Executors.newCachedThreadPool();

    @Inject
    public IndexClient(GrpcChannelPoolFactory channelPoolFactory, DomainBlacklistImpl blacklist) {
    public IndexClient(GrpcChannelPoolFactory channelPoolFactory,
                       DomainBlacklistImpl blacklist,
                       NsfwDomainFilter nsfwDomainFilter
                       ) {
        this.channelPool = channelPoolFactory.createMulti(
                ServiceKey.forGrpcApi(IndexApiGrpc.class, ServicePartition.multi()),
                IndexApiGrpc::newBlockingStub);
        this.blacklist = blacklist;
        this.nsfwDomainFilter = nsfwDomainFilter;
    }

    private static final Comparator<RpcDecoratedResultItem> comparator =
@@ -52,7 +66,7 @@ public class IndexClient {
    public AggregateQueryResponse executeQueries(RpcIndexQuery indexRequest, Pagination pagination) {

        final int requestedMaxResults = indexRequest.getQueryLimits().getResultsTotal();

        int filterTier = indexRequest.getNsfwFilterTierValue();
        AtomicInteger totalNumResults = new AtomicInteger(0);

        List<RpcDecoratedResultItem> results =
@@ -74,7 +88,7 @@ public class IndexClient {
                    }
                })
                .flatMap(List::stream)
                .filter(item -> !isBlacklisted(item))
                .filter(item -> !isBlacklisted(item, filterTier))
                .sorted(comparator)
                .skip(Math.max(0, (pagination.page - 1) * pagination.pageSize))
                .limit(pagination.pageSize)
@@ -83,8 +97,23 @@ public class IndexClient {
        return new AggregateQueryResponse(results, pagination.page(), totalNumResults.get());
    }

    private boolean isBlacklisted(RpcDecoratedResultItem item) {
        return blacklist.isBlacklisted(UrlIdCodec.getDomainId(item.getRawItem().getCombinedId()));
    static String[] tierNames = {
            "OFF",
            "DANGER",
            "NSFW"
    };

    private boolean isBlacklisted(RpcDecoratedResultItem item, int filterTier) {
        int domainId = UrlIdCodec.getDomainId(item.getRawItem().getCombinedId());

        if (blacklist.isBlacklisted(domainId)) {
            return true;
        }
        if (nsfwDomainFilter.isBlocked(domainId, filterTier)) {
            wmsa_index_query_count.labels(tierNames[filterTier]).inc();
            return true;
        }
        return false;
    }

}
@@ -84,7 +84,7 @@ public class ForwardIndexConverter {

        LongArray docFileData = LongArrayFactory.mmapForWritingConfined(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size());

        ByteBuffer workArea = ByteBuffer.allocate(65536);
        ByteBuffer workArea = ByteBuffer.allocate(1024*1024*100);
        for (var instance : journal.pages()) {
            try (var slopTable = new SlopTable(instance.baseDir(), instance.page()))
            {
code/libraries/domain-lock/build.gradle (new file, 32 lines)
@@ -0,0 +1,32 @@
plugins {
    id 'java'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
    }
}

apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {
    implementation libs.bundles.slf4j
    implementation project(':code:common:model')
    implementation project(':code:common:config')
    implementation project(':code:common:service')

    implementation libs.bundles.curator

    implementation libs.guava
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}

test {
    useJUnitPlatform()
}
@@ -0,0 +1,32 @@
package nu.marginalia.coordination;

import nu.marginalia.model.EdgeDomain;

public class DefaultDomainPermits {

    public static int defaultPermits(EdgeDomain domain) {
        return defaultPermits(domain.topDomain.toLowerCase());
    }

    public static int defaultPermits(String topDomain) {

        if (topDomain.equals("wordpress.com"))
            return 16;
        if (topDomain.equals("blogspot.com"))
            return 8;
        if (topDomain.equals("tumblr.com"))
            return 8;
        if (topDomain.equals("neocities.org"))
            return 8;
        if (topDomain.equals("github.io"))
            return 8;
        // Substack really dislikes broad-scale crawlers, so we need to be careful
        // to not get blocked.
        if (topDomain.equals("substack.com")) {
            return 1;
        }

        return 2;
    }

}
@@ -0,0 +1,17 @@
package nu.marginalia.coordination;

import com.google.inject.AbstractModule;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class DomainCoordinationModule extends AbstractModule {

    private static final Logger logger = LoggerFactory.getLogger(DomainCoordinationModule.class);

    public DomainCoordinationModule() {
    }

    public void configure() {
        bind(DomainCoordinator.class).to(ZookeeperDomainCoordinator.class);
    }
}
@@ -0,0 +1,13 @@
package nu.marginalia.coordination;

import nu.marginalia.model.EdgeDomain;

import java.time.Duration;
import java.util.Optional;

public interface DomainCoordinator {
    DomainLock lockDomain(EdgeDomain domain) throws InterruptedException;
    Optional<DomainLock> tryLockDomain(EdgeDomain domain, Duration timeout) throws InterruptedException;
    Optional<DomainLock> tryLockDomain(EdgeDomain domain) throws InterruptedException;
    boolean isLockableHint(EdgeDomain domain);
}
@@ -0,0 +1,5 @@
package nu.marginalia.coordination;

public interface DomainLock extends AutoCloseable {
    void close();
}
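Illustrative sketch only (the real call sites are in the CrawlerMain changes further down): callers are expected to hold a DomainLock for the duration of a domain's crawl via try-with-resources, roughly as below. Variable names are made up for the example, and the enclosing method is assumed to propagate InterruptedException.

    // Sketch of the intended usage pattern; not part of the commit itself.
    Optional<DomainLock> maybeLock = domainCoordinator.tryLockDomain(new EdgeDomain("example.com"));
    if (maybeLock.isEmpty()) {
        return; // another thread or node is already crawling this domain; retry later
    }
    try (DomainLock lock = maybeLock.get()) {
        // crawl the domain while holding the lock
    } // close() releases the semaphore permit, or returns the Zookeeper lease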
@@ -1,16 +1,17 @@
package nu.marginalia.crawl.logic;
package nu.marginalia.coordination;

import com.google.inject.Singleton;
import nu.marginalia.model.EdgeDomain;

import java.time.Duration;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;

/** Holds lock objects for each domain, to prevent multiple threads from
 * crawling the same domain at the same time.
 */
public class DomainLocks {
@Singleton
public class LocalDomainCoordinator implements DomainCoordinator {
    // The locks are stored in a map, with the domain name as the key. This map will grow
    // relatively big, but should be manageable since the number of domains is limited to
    // a few hundred thousand typically.
@@ -24,13 +25,25 @@ public class DomainLocks {

        sem.acquire();

        return new DomainLock(sem);
        return new LocalDomainLock(sem);
    }

    public Optional<DomainLock> tryLockDomain(EdgeDomain domain) {
        var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
        if (sem.tryAcquire(1)) {
            return Optional.of(new DomainLock(sem));
            return Optional.of(new LocalDomainLock(sem));
        }
        else {
            // We don't have a lock, so we return an empty optional
            return Optional.empty();
        }
    }


    public Optional<DomainLock> tryLockDomain(EdgeDomain domain, Duration timeout) throws InterruptedException {
        var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
        if (sem.tryAcquire(1, timeout.toMillis(), TimeUnit.MILLISECONDS)) {
            return Optional.of(new LocalDomainLock(sem));
        }
        else {
            // We don't have a lock, so we return an empty optional
@@ -39,24 +52,7 @@ public class DomainLocks {
        }

    private Semaphore defaultPermits(String topDomain) {
        if (topDomain.equals("wordpress.com"))
            return new Semaphore(16);
        if (topDomain.equals("blogspot.com"))
            return new Semaphore(8);
        if (topDomain.equals("tumblr.com"))
            return new Semaphore(8);
        if (topDomain.equals("neocities.org"))
            return new Semaphore(8);
        if (topDomain.equals("github.io"))
            return new Semaphore(8);

        // Substack really dislikes broad-scale crawlers, so we need to be careful
        // to not get blocked.
        if (topDomain.equals("substack.com")) {
            return new Semaphore(1);
        }

        return new Semaphore(2);
        return new Semaphore(DefaultDomainPermits.defaultPermits(topDomain));
    }

    /** Returns true if the domain is lockable, i.e. if it is not already locked by another thread.
@@ -71,15 +67,15 @@ public class DomainLocks {
        return sem.availablePermits() > 0;
    }

    public static class DomainLock implements AutoCloseable {
    public static class LocalDomainLock implements DomainLock {
        private final Semaphore semaphore;

        DomainLock(Semaphore semaphore) {
        LocalDomainLock(Semaphore semaphore) {
            this.semaphore = semaphore;
        }

        @Override
        public void close() throws Exception {
        public void close() {
            semaphore.release();
            Thread.currentThread().setName("[idle]");
        }
@@ -0,0 +1,116 @@
package nu.marginalia.coordination;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.service.discovery.ServiceRegistryIf;
import org.apache.curator.framework.recipes.locks.InterProcessSemaphoreV2;
import org.apache.curator.framework.recipes.locks.Lease;

import java.time.Duration;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;

@Singleton
public class ZookeeperDomainCoordinator implements DomainCoordinator {
    // The locks are stored in a map, with the domain name as the key. This map will grow
    // relatively big, but should be manageable since the number of domains is limited to
    // a few hundred thousand typically.
    private final Map<String, InterProcessSemaphoreV2> locks = new ConcurrentHashMap<>();
    private final Map<String, Integer> waitCounts = new ConcurrentHashMap<>();

    private final ServiceRegistryIf serviceRegistry;
    private final int nodeId;

    @Inject
    public ZookeeperDomainCoordinator(ServiceRegistryIf serviceRegistry, @Named("wmsa-system-node") int nodeId) {
        // Zookeeper-specific initialization can be done here if needed
        this.serviceRegistry = serviceRegistry;
        this.nodeId = nodeId;
    }

    /** Returns a lock object corresponding to the given domain. The object is returned as-is,
     * and may be held by another thread. The caller is responsible for locking and releasing the lock.
     */
    public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
        final String key = domain.topDomain.toLowerCase();
        var sem = locks.computeIfAbsent(key, this::createSemaphore);

        // Increment or add a wait count for the domain
        waitCounts.compute(key, (k,value) -> (value == null ? 1 : value + 1));
        try {
            return new ZkDomainLock(sem, sem.acquire());
        }
        catch (Exception e) {
            throw new RuntimeException("Failed to acquire lock for domain: " + domain.topDomain, e);
        }
        finally {
            // Decrement or remove the wait count for the domain
            waitCounts.compute(key, (k,value) -> (value == null || value <= 1) ? null : value - 1);
        }
    }


    public Optional<DomainLock> tryLockDomain(EdgeDomain domain) throws InterruptedException {
        return tryLockDomain(domain, Duration.ofSeconds(1)); // Underlying semaphore doesn't have a tryLock method, so we use a short timeout
    }


    public Optional<DomainLock> tryLockDomain(EdgeDomain domain, Duration timeout) throws InterruptedException {
        final String key = domain.topDomain.toLowerCase();
        var sem = locks.computeIfAbsent(key, this::createSemaphore);

        // Increment or add a wait count for the domain
        waitCounts.compute(key, (k,value) -> (value == null ? 1 : value + 1));
        try {
            var lease = sem.acquire(timeout.toMillis(), TimeUnit.MILLISECONDS); // Acquire with timeout
            if (lease != null) {
                return Optional.of(new ZkDomainLock(sem, lease));
            }
            else {
                return Optional.empty(); // If we fail to acquire the lease, we return an empty optional
            }
        }
        catch (Exception e) {
            return Optional.empty(); // If we fail to acquire the lock, we return an empty optional
        }
        finally {
            waitCounts.compute(key, (k,value) -> (value == null || value <= 1) ? null : value - 1);
        }
    }

    private InterProcessSemaphoreV2 createSemaphore(String topDomain) {
        try {
            return serviceRegistry.getSemaphore(topDomain + ":" + nodeId, DefaultDomainPermits.defaultPermits(topDomain));
        }
        catch (Exception e) {
            throw new RuntimeException("Failed to get semaphore for domain: " + topDomain, e);
        }
    }

    /** Returns true if the domain is lockable, i.e. if it is not already locked by another thread.
     * (this is just a hint, and does not guarantee that the domain is actually lockable any time
     * after this method returns true)
     */
    public boolean isLockableHint(EdgeDomain domain) {
        return !waitCounts.containsKey(domain.topDomain.toLowerCase());
    }

    public static class ZkDomainLock implements DomainLock {
        private final InterProcessSemaphoreV2 semaphore;
        private final Lease lease;

        ZkDomainLock(InterProcessSemaphoreV2 semaphore, Lease lease) {
            this.semaphore = semaphore;
            this.lease = lease;
        }

        @Override
        public void close() {
            semaphore.returnLease(lease);
        }
    }
}
@@ -15,6 +15,10 @@ dependencies {

    implementation libs.bundles.slf4j
    implementation libs.opencsv
    implementation libs.guava
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
@@ -1,5 +1,6 @@
package nu.marginalia.geoip;

import com.google.inject.Singleton;
import nu.marginalia.WmsaHome;
import nu.marginalia.geoip.sources.AsnMapping;
import nu.marginalia.geoip.sources.AsnTable;
@@ -10,6 +11,7 @@ import org.slf4j.LoggerFactory;
import java.net.InetAddress;
import java.util.Optional;

@Singleton
public class GeoIpDictionary {
    private volatile IP2LocationMapping ip2locMapping = null;
    private volatile AsnTable asnTable = null;
@@ -76,7 +78,7 @@ public class GeoIpDictionary {
    }

    public Optional<AsnTable.AsnInfo> getAsnInfo(int ipAddress) {
        if (null == asnTable) { // not loaded yet or failed to load
        if (null == asnMapping || null == asnTable) { // not loaded yet or failed to load
            return Optional.empty();
        }

@@ -53,6 +53,7 @@ public class SideloaderProcessing {
                "",
                body.getBytes(StandardCharsets.UTF_8),
                false,
                -1,
                null,
                null
        );
@@ -2002,12 +2002,11 @@ public class HeadingAwarePDFTextStripper extends LegacyPDFStreamEngine
        float minFontWeight = Integer.MAX_VALUE;
        for (var word : line)
        {
            int i = 0;
            for (var textPosition : word.getTextPositions())
            {
                if (word.text.charAt(i++) == ' ') {
                    continue;
                }
                // Skip empty text positions as they may have a different font
                if (word.text.isBlank()) continue;

                var font = textPosition.getFont();
                if (font == null) continue;
                var descriptor = font.getFontDescriptor();
@@ -148,6 +148,7 @@ public class ConvertingIntegrationTest {
                "",
                readClassPathFile(p.toString()).getBytes(),
                false,
                -1,
                null,
                null
        );
@@ -50,7 +50,7 @@ class PdfDocumentProcessorPluginTest {
        ));
    }
    public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(byte[] pdfBytes) throws Exception {
        var doc = new CrawledDocument("test", "https://www.example.com/sample.pdf", "application/pdf", Instant.now().toString(), 200, "OK", "OK", "", pdfBytes, false, null, null);
        var doc = new CrawledDocument("test", "https://www.example.com/sample.pdf", "application/pdf", Instant.now().toString(), 200, "OK", "OK", "", pdfBytes, false, -1, null, null);
        return plugin.createDetails(doc, new LinkTexts(), DocumentClass.NORMAL);
    }

@@ -32,6 +32,7 @@ dependencies {
    implementation project(':code:libraries:message-queue')
    implementation project(':code:libraries:language-processing')
    implementation project(':code:libraries:easy-lsh')
    implementation project(':code:libraries:domain-lock')
    implementation project(':code:processes:crawling-process:model')
    implementation project(':code:processes:crawling-process:model')

@@ -58,6 +59,7 @@ dependencies {
    implementation libs.jsoup
    implementation libs.opencsv
    implementation libs.fastutil
    implementation libs.bundles.curator

    implementation libs.bundles.mariadb
    implementation libs.bundles.httpcomponents
@@ -10,9 +10,11 @@ import nu.marginalia.WmsaHome;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.source.AnchorTagsSource;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.coordination.DomainCoordinationModule;
import nu.marginalia.coordination.DomainCoordinator;
import nu.marginalia.coordination.DomainLock;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.logic.DomainLocks;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainProber;
@@ -25,9 +27,12 @@ import nu.marginalia.mq.MessageQueueFactory;
import nu.marginalia.process.ProcessConfiguration;
import nu.marginalia.process.ProcessConfigurationModule;
import nu.marginalia.process.ProcessMainClass;
import nu.marginalia.process.control.ProcessEventLog;
import nu.marginalia.process.control.ProcessHeartbeatImpl;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.service.module.ServiceDiscoveryModule;
import nu.marginalia.slop.SlopCrawlDataRecord;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageId;
@@ -54,6 +59,7 @@ public class CrawlerMain extends ProcessMainClass {

    private final UserAgent userAgent;
    private final ProcessHeartbeatImpl heartbeat;
    private final ProcessEventLog eventLog;
    private final DomainProber domainProber;
    private final FileStorageService fileStorageService;
    private final AnchorTagsSourceFactory anchorTagsSourceFactory;
@@ -61,9 +67,10 @@ public class CrawlerMain extends ProcessMainClass {
    private final HikariDataSource dataSource;
    private final DomainBlacklist blacklist;
    private final int node;
    private final ServiceRegistryIf serviceRegistry;
    private final SimpleBlockingThreadPool pool;

    private final DomainLocks domainLocks = new DomainLocks();
    private final DomainCoordinator domainCoordinator;

    private final Map<String, CrawlTask> pendingCrawlTasks = new ConcurrentHashMap<>();

@@ -84,6 +91,7 @@ public class CrawlerMain extends ProcessMainClass {
    public CrawlerMain(UserAgent userAgent,
                       HttpFetcherImpl httpFetcher,
                       ProcessHeartbeatImpl heartbeat,
                       ProcessEventLog eventLog,
                       MessageQueueFactory messageQueueFactory, DomainProber domainProber,
                       FileStorageService fileStorageService,
                       ProcessConfiguration processConfiguration,
@@ -91,6 +99,8 @@ public class CrawlerMain extends ProcessMainClass {
                       WarcArchiverFactory warcArchiverFactory,
                       HikariDataSource dataSource,
                       DomainBlacklist blacklist,
                       DomainCoordinator domainCoordinator,
                       ServiceRegistryIf serviceRegistry,
                       Gson gson) throws InterruptedException {

        super(messageQueueFactory, processConfiguration, gson, CRAWLER_INBOX);
@@ -98,6 +108,7 @@ public class CrawlerMain extends ProcessMainClass {
        this.userAgent = userAgent;
        this.fetcher = httpFetcher;
        this.heartbeat = heartbeat;
        this.eventLog = eventLog;
        this.domainProber = domainProber;
        this.fileStorageService = fileStorageService;
        this.anchorTagsSourceFactory = anchorTagsSourceFactory;
@@ -105,6 +116,8 @@ public class CrawlerMain extends ProcessMainClass {
        this.dataSource = dataSource;
        this.blacklist = blacklist;
        this.node = processConfiguration.node();
        this.serviceRegistry = serviceRegistry;
        this.domainCoordinator = domainCoordinator;

        SimpleBlockingThreadPool.ThreadType threadType;
        if (Boolean.getBoolean("crawler.useVirtualThreads")) {
@@ -147,12 +160,18 @@ public class CrawlerMain extends ProcessMainClass {
        Injector injector = Guice.createInjector(
                new CrawlerModule(),
                new ProcessConfigurationModule("crawler"),
                new ServiceDiscoveryModule(),
                new DomainCoordinationModule(),
                new DatabaseModule(false)
        );
        var crawler = injector.getInstance(CrawlerMain.class);

        var instructions = crawler.fetchInstructions(nu.marginalia.mqapi.crawling.CrawlRequest.class);

        crawler.serviceRegistry.registerProcess("crawler", crawler.node);

        try {
            crawler.eventLog.logEvent("CRAWLER-INFO", "Crawling started");
            var req = instructions.value();
            if (req.targetDomainName != null) {
                crawler.runForSingleDomain(req.targetDomainName, req.crawlStorage);
@@ -160,11 +179,15 @@ public class CrawlerMain extends ProcessMainClass {
            else {
                crawler.runForDatabaseDomains(req.crawlStorage);
            }
            crawler.eventLog.logEvent("CRAWLER-INFO", "Crawl completed successfully");
            instructions.ok();
        } catch (Exception ex) {
            logger.error("Crawler failed", ex);
            instructions.err();
        }
        finally {
            crawler.serviceRegistry.deregisterProcess("crawler", crawler.node);
        }

        TimeUnit.SECONDS.sleep(5);
    }
@@ -433,7 +456,7 @@ public class CrawlerMain extends ProcessMainClass {
        /** Best effort indicator whether we could start this now without getting stuck in
         * DomainLocks purgatory */
        public boolean canRun() {
            return domainLocks.isLockableHint(new EdgeDomain(domain));
            return domainCoordinator.isLockableHint(new EdgeDomain(domain));
        }

        @Override
@@ -444,7 +467,7 @@ public class CrawlerMain extends ProcessMainClass {
                return;
            }

            Optional<DomainLocks.DomainLock> lock = domainLocks.tryLockDomain(new EdgeDomain(domain));
            Optional<DomainLock> lock = domainCoordinator.tryLockDomain(new EdgeDomain(domain));
            // We don't have a lock, so we can't run this task
            // we return to avoid blocking the pool for too long
            if (lock.isEmpty()) {
@@ -452,7 +475,7 @@ public class CrawlerMain extends ProcessMainClass {
                retryQueue.put(this);
                return;
            }
            DomainLocks.DomainLock domainLock = lock.get();
            DomainLock domainLock = lock.get();

            try (domainLock) {
                Thread.currentThread().setName("crawling:" + domain);
@@ -52,6 +52,7 @@ import java.io.IOException;
import java.net.SocketTimeoutException;
import java.net.URISyntaxException;
import java.net.UnknownHostException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.time.Duration;
import java.time.Instant;
@@ -87,13 +88,14 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
        return connectionManager.getTotalStats();
    }

    private CloseableHttpClient createClient() throws NoSuchAlgorithmException {
    private CloseableHttpClient createClient() throws NoSuchAlgorithmException, KeyManagementException {
        final ConnectionConfig connectionConfig = ConnectionConfig.custom()
                .setSocketTimeout(10, TimeUnit.SECONDS)
                .setConnectTimeout(30, TimeUnit.SECONDS)
                .setValidateAfterInactivity(TimeValue.ofSeconds(5))
                .build();

        connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
                .setMaxConnPerRoute(2)
                .setMaxConnTotal(5000)
@@ -183,6 +185,8 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
            this.client = createClient();
        } catch (NoSuchAlgorithmException e) {
            throw new RuntimeException(e);
        } catch (KeyManagementException e) {
            throw new RuntimeException(e);
        }
        this.userAgentString = userAgent.uaString();
        this.userAgentIdentifier = userAgent.uaIdentifier();
@@ -193,6 +197,8 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
            this.client = createClient();
        } catch (NoSuchAlgorithmException e) {
            throw new RuntimeException(e);
        } catch (KeyManagementException e) {
            throw new RuntimeException(e);
        }
        this.userAgentString = userAgent;
        this.userAgentIdentifier = userAgent;
@@ -10,6 +10,7 @@ import java.net.http.HttpClient;
import java.net.http.HttpHeaders;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.*;
import java.util.stream.Collectors;

@@ -90,8 +91,8 @@ public class WarcProtocolReconstructor {
        return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
    }

    static String getResponseHeader(ClassicHttpResponse response, long size) {
        String headerString = getHeadersAsString(response.getHeaders(), size);
    static String getResponseHeader(ClassicHttpResponse response, Duration responseDuration, long size) {
        String headerString = getHeadersAsString(response.getHeaders(), responseDuration, size);

        return response.getVersion().format() + " " + response.getCode() + " " + response.getReasonPhrase() + "\r\n" + headerString + "\r\n\r\n";
    }
@@ -160,7 +161,7 @@ public class WarcProtocolReconstructor {

    static private String getHeadersAsString(Header[] headers, long responseSize) {
    static private String getHeadersAsString(Header[] headers, Duration responseDuration, long responseSize) {
        StringJoiner joiner = new StringJoiner("\r\n");

        for (var header : headers) {
@@ -176,6 +177,7 @@ public class WarcProtocolReconstructor {
            if (headerCapitalized.equals("Content-Encoding"))
                continue;

            // Since we're transparently decoding gzip, we need to update the Content-Length header
            // to reflect the actual size of the response body. We'll do this at the end.
            if (headerCapitalized.equals("Content-Length"))
@@ -184,6 +186,7 @@ public class WarcProtocolReconstructor {
            joiner.add(headerCapitalized + ": " + header.getValue());
        }

        joiner.add("X-Marginalia-Response-Time: " + responseDuration.toMillis());
        joiner.add("Content-Length: " + responseSize);

        return joiner.toString();
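As a rough illustration (the values below are invented), the reconstructed response header block written to the WARC now carries a synthetic timing header alongside the recalculated Content-Length, along the lines of:

    HTTP/1.1 200 OK
    Content-Type: text/html; charset=UTF-8
    X-Marginalia-Response-Time: 472
    Content-Length: 18233

The SlopCrawlDataRecord conversion further down parses X-Marginalia-Response-Time back into requestTimeMs and drops it from the stored header string, so the timing appears to ride along in the WARC without requiring a format change there.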
@@ -93,7 +93,7 @@ public class WarcRecorder implements AutoCloseable {
        WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
        WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();

        Instant date = Instant.now();
        Instant requestDate = Instant.now();

        // Not entirely sure why we need to do this, but keeping it due to Chesterton's Fence
        Map<String, List<String>> extraHeaders = new HashMap<>(request.getHeaders().length);
@@ -108,6 +108,8 @@ public class WarcRecorder implements AutoCloseable {
        try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request, timeout);
             InputStream inputStream = inputBuffer.read()) {

            Instant responseDate = Instant.now();

            cookies.updateCookieStore(response);

            // Build and write the request
@@ -126,7 +128,7 @@ public class WarcRecorder implements AutoCloseable {

            WarcRequest warcRequest = new WarcRequest.Builder(requestUri)
                    .blockDigest(requestDigestBuilder.build())
                    .date(date)
                    .date(requestDate)
                    .body(MediaType.HTTP_REQUEST, httpRequestString)
                    .build();

@@ -138,7 +140,9 @@ public class WarcRecorder implements AutoCloseable {
                response.addHeader("X-Has-Cookies", 1);
            }

            byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
            byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response,
                    Duration.between(requestDate, responseDate),
                    inputBuffer.size()).getBytes(StandardCharsets.UTF_8);

            ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);

@@ -169,7 +173,7 @@ public class WarcRecorder implements AutoCloseable {

            WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
                    .blockDigest(responseDigestBuilder.build())
                    .date(date)
                    .date(responseDate)
                    .concurrentTo(warcRequest.id())
                    .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());

@@ -184,7 +188,7 @@ public class WarcRecorder implements AutoCloseable {
            warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
            writer.write(warcResponse);

            if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
            if (Duration.between(requestDate, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
                && inputBuffer.size() < 2048
                && !requestUri.getPath().endsWith("robots.txt")) // don't bail on robots.txt
            {
@@ -196,7 +200,7 @@ public class WarcRecorder implements AutoCloseable {

                logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)",
                        requestUri,
                        Duration.between(date, Instant.now()).getSeconds(),
                        Duration.between(requestDate, Instant.now()).getSeconds(),
                        inputBuffer.size()
                );

@@ -115,9 +115,13 @@ public class CrawlerRetreiver implements AutoCloseable {
                final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(probedUrl.domain, warcRecorder);
                final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());

                if (!robotsRules.isAllowed(probedUrl.toString())) {
                    warcRecorder.flagAsRobotsTxtError(probedUrl);
                    yield 1; // Nothing we can do here, we aren't allowed to fetch the root URL
                }
                delayTimer.waitFetchDelay(0); // initial delay after robots.txt

                DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, delayTimer);
                DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, robotsRules, delayTimer);
                domainStateDb.save(summaryRecord);

                if (Thread.interrupted()) {
@@ -270,11 +274,11 @@ public class CrawlerRetreiver implements AutoCloseable {

    private DomainStateDb.SummaryRecord sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
    private DomainStateDb.SummaryRecord sniffRootDocument(EdgeUrl rootUrl, SimpleRobotRules robotsRules, CrawlDelayTimer timer) {
        Optional<String> feedLink = Optional.empty();

        try {
            var url = rootUrl.withPathAndParam("/", null);
            EdgeUrl url = rootUrl.withPathAndParam("/", null);

            HttpFetchResult result = fetcher.fetchContent(url, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
            timer.waitFetchDelay(0);
@@ -331,7 +335,7 @@ public class CrawlerRetreiver implements AutoCloseable {

            if (feedLink.isEmpty()) {
                feedLink = guessFeedUrl(timer);
                feedLink = guessFeedUrl(timer, robotsRules);
            }

            // Download the sitemap if available
@@ -339,14 +343,18 @@ public class CrawlerRetreiver implements AutoCloseable {

            // Grab the favicon if it exists

            if (fetcher.fetchContent(faviconUrl, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED) instanceof HttpFetchResult.ResultOk iconResult) {
                String contentType = iconResult.header("Content-Type");
                byte[] iconData = iconResult.getBodyBytes();
            if (robotsRules.isAllowed(faviconUrl.toString())) {
                if (fetcher.fetchContent(faviconUrl, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED)
                        instanceof HttpFetchResult.ResultOk iconResult)
                {
                    String contentType = iconResult.header("Content-Type");
                    byte[] iconData = iconResult.getBodyBytes();

                domainStateDb.saveIcon(
                        domain,
                        new DomainStateDb.FaviconRecord(contentType, iconData)
                );
                    domainStateDb.saveIcon(
                            domain,
                            new DomainStateDb.FaviconRecord(contentType, iconData)
                    );
                }
            }
            timer.waitFetchDelay(0);

@@ -383,7 +391,7 @@ public class CrawlerRetreiver implements AutoCloseable {
            "blog/rss"
    );

    private Optional<String> guessFeedUrl(CrawlDelayTimer timer) throws InterruptedException {
    private Optional<String> guessFeedUrl(CrawlDelayTimer timer, SimpleRobotRules robotsRules) throws InterruptedException {
        var oldDomainStateRecord = domainStateDb.getSummary(domain);

        // If we are already aware of an old feed URL, then we can just revalidate it
@@ -396,6 +404,9 @@ public class CrawlerRetreiver implements AutoCloseable {

        for (String endpoint : likelyFeedEndpoints) {
            String url = "https://" + domain + "/" + endpoint;
            if (!robotsRules.isAllowed(url)) {
                continue;
            }
            if (validateFeedUrl(url, timer)) {
                return Optional.of(url);
            }
@@ -148,6 +148,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
                nextRecord.body,
                // this field isn't actually used, maybe we can skip calculating it?
                nextRecord.cookies,
                -1,
                lastModified,
                etag));
    }
@@ -166,6 +166,7 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
                nextRecord.body(),
                // this field isn't actually used, maybe we can skip calculating it?
                nextRecord.cookies(),
                nextRecord.requestTimeMs(),
                null,
                null));
    }
@@ -23,6 +23,7 @@ public final class CrawledDocument implements SerializableCrawlData {

    public String crawlerStatus;
    public String crawlerStatusDesc;
    public int requestTimeMs;

    @Nullable
    public String headers;
@@ -82,7 +83,7 @@ public final class CrawledDocument implements SerializableCrawlData {
    public String lastModifiedMaybe;
    public String etagMaybe;

    public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, byte[] documentBodyBytes, Boolean hasCookies, String lastModifiedMaybe, String etagMaybe) {
    public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, byte[] documentBodyBytes, Boolean hasCookies, int requestTimeMs, String lastModifiedMaybe, String etagMaybe) {
        this.crawlId = crawlId;
        this.url = url;
        this.contentType = contentType;
@@ -94,6 +95,7 @@ public final class CrawledDocument implements SerializableCrawlData {
        this.documentBodyBytes = Objects.requireNonNullElse(documentBodyBytes, new byte[] {});
        this.hasCookies = hasCookies;
        this.lastModifiedMaybe = lastModifiedMaybe;
        this.requestTimeMs = requestTimeMs;
        this.etagMaybe = etagMaybe;
    }

@@ -173,6 +175,7 @@ public final class CrawledDocument implements SerializableCrawlData {
        private byte[] documentBodyBytes = new byte[0];
        private String recrawlState;
        private Boolean hasCookies;
        private int requestTimeMs;
        private String lastModifiedMaybe;
        private String etagMaybe;

@@ -248,8 +251,13 @@ public final class CrawledDocument implements SerializableCrawlData {
            return this;
        }

        public CrawledDocumentBuilder requestTimeMs(int requestTimeMs) {
            this.requestTimeMs = requestTimeMs;
            return this;
        }

        public CrawledDocument build() {
            return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBodyBytes, this.hasCookies, this.lastModifiedMaybe, this.etagMaybe);
            return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBodyBytes, this.hasCookies, this.requestTimeMs, this.lastModifiedMaybe, this.etagMaybe);
        }

        public String toString() {
@@ -9,6 +9,7 @@ import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecord;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
import nu.marginalia.slop.column.array.ByteArrayColumn;
import nu.marginalia.slop.column.primitive.ByteColumn;
import nu.marginalia.slop.column.primitive.IntColumn;
import nu.marginalia.slop.column.primitive.LongColumn;
import nu.marginalia.slop.column.primitive.ShortColumn;
import nu.marginalia.slop.column.string.EnumColumn;
@@ -39,6 +40,7 @@ public record SlopCrawlDataRecord(String domain,
                                  long timestamp,
                                  String contentType,
                                  byte[] body,
                                  int requestTimeMs,
                                  String headers)
{
    private static final EnumColumn domainColumn = new EnumColumn("domain", StandardCharsets.UTF_8, StorageType.ZSTD);
@@ -49,6 +51,7 @@ public record SlopCrawlDataRecord(String domain,
    private static final LongColumn timestampColumn = new LongColumn("timestamp");
    private static final EnumColumn contentTypeColumn = new EnumColumn("contentType", StandardCharsets.UTF_8);
    private static final ByteArrayColumn bodyColumn = new ByteArrayColumn("body", StorageType.ZSTD);
    private static final ShortColumn requestTimeColumn = new ShortColumn("requestTimeMs");
    private static final StringColumn headerColumn = new StringColumn("header", StandardCharsets.UTF_8, StorageType.ZSTD);

    public SlopCrawlDataRecord(CrawledDocumentParquetRecord parquetRecord) {
@@ -60,6 +63,7 @@ public record SlopCrawlDataRecord(String domain,
                parquetRecord.timestamp.toEpochMilli(),
                parquetRecord.contentType,
                parquetRecord.body,
                -1,
                parquetRecord.headers
        );
    }
@@ -74,6 +78,7 @@ public record SlopCrawlDataRecord(String domain,
                date.toEpochMilli(),
                "x-marginalia/advisory;state=redirect",
                new byte[0],
                -1,
                ""
        );
    }
@@ -87,6 +92,7 @@ public record SlopCrawlDataRecord(String domain,
                date.toEpochMilli(),
                "x-marginalia/advisory;state=error",
                errorStatus.getBytes(),
                -1,
                ""
        );
    }
@@ -100,6 +106,7 @@ public record SlopCrawlDataRecord(String domain,
                date.toEpochMilli(),
                errorStatus,
                new byte[0],
                -1,
                ""
        );
    }
@@ -321,6 +328,7 @@ public record SlopCrawlDataRecord(String domain,
        private final LongColumn.Writer timestampColumnWriter;
        private final EnumColumn.Writer contentTypeColumnWriter;
        private final ByteArrayColumn.Writer bodyColumnWriter;
        private final ShortColumn.Writer requestTimeColumnWriter;
        private final StringColumn.Writer headerColumnWriter;

        public Writer(Path path) throws IOException {
@@ -334,6 +342,7 @@ public record SlopCrawlDataRecord(String domain,
            timestampColumnWriter = timestampColumn.create(this);
            contentTypeColumnWriter = contentTypeColumn.create(this);
            bodyColumnWriter = bodyColumn.create(this);
            requestTimeColumnWriter = requestTimeColumn.create(this);
            headerColumnWriter = headerColumn.create(this);
        }

@@ -346,6 +355,7 @@ public record SlopCrawlDataRecord(String domain,
            timestampColumnWriter.put(record.timestamp);
            contentTypeColumnWriter.put(record.contentType);
            bodyColumnWriter.put(record.body);
            requestTimeColumnWriter.put((short) record.requestTimeMs);
            headerColumnWriter.put(record.headers);
        }

@@ -391,10 +401,20 @@ public record SlopCrawlDataRecord(String domain,

            String headersStr;
            StringJoiner headersStrBuilder = new StringJoiner("\n");
            int requestTimeMs = -1;
            for (var header : headers) {
                if (header.getName().equalsIgnoreCase("X-Cookies") && "1".equals(header.getValue())) {
                    hasCookies = true;
                }
                if (header.getName().equals("X-Marginalia-Response-Time")) {
                    try {
                        requestTimeMs = Integer.parseInt(header.getValue());
                    }
                    catch (NumberFormatException ex) {
                        logger.warn("Failed to parse X-Marginalia-Response-Time header: {}", header.getValue());
                    }
                    continue;
                }
                headersStrBuilder.add(header.getName() + ": " + header.getValue());
            }
            headersStr = headersStrBuilder.toString();
@@ -409,6 +429,7 @@ public record SlopCrawlDataRecord(String domain,
                    response.date().toEpochMilli(),
                    contentType,
                    bodyBytes,
                    requestTimeMs,
                    headersStr
                )
            );
@@ -461,6 +482,7 @@ public record SlopCrawlDataRecord(String domain,
        private final LongColumn.Reader timestampColumnReader;
        private final EnumColumn.Reader contentTypeColumnReader;
        private final ByteArrayColumn.Reader bodyColumnReader;
        private final ShortColumn.Reader requestTimeColumnReader;
        private final StringColumn.Reader headerColumnReader;

        public Reader(Path path) throws IOException {
@@ -475,6 +497,17 @@ public record SlopCrawlDataRecord(String domain,
            contentTypeColumnReader = contentTypeColumn.open(this);
            bodyColumnReader = bodyColumn.open(this);
            headerColumnReader = headerColumn.open(this);

            // FIXME: After 2025-06-XX, we can remove this migration workaround
            ShortColumn.Reader timeColumnReader;
            try {
                timeColumnReader = requestTimeColumn.open(this);
            }
            catch (Exception ex) {
                // Migration workaround
                timeColumnReader = null;
            }
            requestTimeColumnReader = timeColumnReader;
        }

        public SlopCrawlDataRecord get() throws IOException {
@@ -487,6 +520,7 @@ public record SlopCrawlDataRecord(String domain,
                    timestampColumnReader.get(),
                    contentTypeColumnReader.get(),
                    bodyColumnReader.get(),
                    requestTimeColumnReader != null ? requestTimeColumnReader.get() : -1,
                    headerColumnReader.get()
            );
        }
@@ -506,6 +540,7 @@ public record SlopCrawlDataRecord(String domain,
        private final LongColumn.Reader timestampColumnReader;
        private final EnumColumn.Reader contentTypeColumnReader;
        private final ByteArrayColumn.Reader bodyColumnReader;
        private final ShortColumn.Reader requestTimeColumnReader;
        private final StringColumn.Reader headerColumnReader;

        private SlopCrawlDataRecord next = null;
@@ -522,6 +557,17 @@ public record SlopCrawlDataRecord(String domain,
            contentTypeColumnReader = contentTypeColumn.open(this);
            bodyColumnReader = bodyColumn.open(this);
            headerColumnReader = headerColumn.open(this);

            // FIXME: After 2025-06-XX, we can remove this migration workaround
            ShortColumn.Reader timeColumnReader;
            try {
                timeColumnReader = requestTimeColumn.open(this);
            }
            catch (Exception ex) {
                // Migration workaround
                timeColumnReader = null;
            }
            requestTimeColumnReader = timeColumnReader;
        }

        public abstract boolean filter(String url, int status, String contentType);
@@ -548,6 +594,7 @@ public record SlopCrawlDataRecord(String domain,
            boolean cookies = cookiesColumnReader.get() == 1;
            int status = statusColumnReader.get();
            long timestamp = timestampColumnReader.get();
            int requestTimeMs = requestTimeColumnReader != null ? requestTimeColumnReader.get() : -1;
            String contentType = contentTypeColumnReader.get();

            LargeItem<byte[]> body = bodyColumnReader.getLarge();
@@ -555,7 +602,7 @@ public record SlopCrawlDataRecord(String domain,

            if (filter(url, status, contentType)) {
                next = new SlopCrawlDataRecord(
                        domain, url, ip, cookies, status, timestamp, contentType, body.get(), headers.get()
                        domain, url, ip, cookies, status, timestamp, contentType, body.get(), requestTimeMs, headers.get()
                );
                return true;
            }
@@ -32,6 +32,7 @@ dependencies {
    implementation project(':code:index:api')
    implementation project(':code:processes:process-mq-api')
    implementation project(':code:libraries:message-queue')
    implementation project(':code:libraries:domain-lock')
    implementation project(':code:libraries:language-processing')
    implementation project(':code:libraries:easy-lsh')
    implementation project(':code:processes:crawling-process')
@@ -49,6 +50,7 @@ dependencies {

    implementation libs.notnull
    implementation libs.guava
    implementation libs.httpclient
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
@@ -195,6 +195,7 @@ public class LiveCrawlDataSet implements AutoCloseable {
                headers,
                body,
                false,
                -1,
                "",
                ""
        ));
@@ -10,9 +10,12 @@ import nu.marginalia.api.feeds.FeedsClient;
|
||||
import nu.marginalia.converting.ConverterModule;
|
||||
import nu.marginalia.converting.processor.DomainProcessor;
|
||||
import nu.marginalia.converting.writer.ConverterBatchWriter;
|
||||
import nu.marginalia.coordination.DomainCoordinationModule;
|
||||
import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.db.DomainBlacklist;
|
||||
import nu.marginalia.io.SerializableCrawlDataStream;
|
||||
import nu.marginalia.livecrawler.io.HttpClientProvider;
|
||||
import nu.marginalia.loading.LoaderInputData;
|
||||
import nu.marginalia.loading.documents.DocumentLoaderService;
|
||||
import nu.marginalia.loading.documents.KeywordLoaderService;
|
||||
@@ -30,12 +33,15 @@ import nu.marginalia.service.module.ServiceDiscoveryModule;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageBaseType;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
|
||||
import org.apache.hc.core5.io.CloseMode;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.security.Security;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.time.temporal.ChronoUnit;
|
||||
import java.util.HashMap;
|
||||
@@ -58,6 +64,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
private final FileStorageService fileStorageService;
|
||||
private final KeywordLoaderService keywordLoaderService;
|
||||
private final DocumentLoaderService documentLoaderService;
|
||||
private final DomainCoordinator domainCoordinator;
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
@Inject
|
||||
@@ -71,7 +78,9 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
DomainProcessor domainProcessor,
|
||||
FileStorageService fileStorageService,
|
||||
KeywordLoaderService keywordLoaderService,
|
||||
DocumentLoaderService documentLoaderService, HikariDataSource dataSource)
|
||||
DocumentLoaderService documentLoaderService,
|
||||
DomainCoordinator domainCoordinator,
|
||||
HikariDataSource dataSource)
|
||||
throws Exception
|
||||
{
|
||||
super(messageQueueFactory, config, gson, LIVE_CRAWLER_INBOX);
|
||||
@@ -84,6 +93,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
this.fileStorageService = fileStorageService;
|
||||
this.keywordLoaderService = keywordLoaderService;
|
||||
this.documentLoaderService = documentLoaderService;
|
||||
this.domainCoordinator = domainCoordinator;
|
||||
this.dataSource = dataSource;
|
||||
|
||||
domainBlacklist.waitUntilLoaded();
|
||||
@@ -107,6 +117,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
try {
|
||||
Injector injector = Guice.createInjector(
|
||||
new LiveCrawlerModule(),
|
||||
new DomainCoordinationModule(), // 2 hours lease timeout is enough for the live crawler
|
||||
new ProcessConfigurationModule("crawler"),
|
||||
new ConverterModule(),
|
||||
new ServiceDiscoveryModule(),
|
||||
@@ -143,7 +154,10 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
}
|
||||
|
||||
private void run() throws Exception {
|
||||
Path basePath = fileStorageService.getStorageBase(FileStorageBaseType.STORAGE).asPath().resolve("live-crawl-data");
|
||||
Path basePath = fileStorageService
|
||||
.getStorageBase(FileStorageBaseType.STORAGE)
|
||||
.asPath()
|
||||
.resolve("live-crawl-data");
|
||||
|
||||
if (!Files.isDirectory(basePath)) {
|
||||
Files.createDirectories(basePath);
|
||||
@@ -158,21 +172,38 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
{
|
||||
final Instant cutoff = Instant.now().minus(60, ChronoUnit.DAYS);
|
||||
|
||||
/* ------------------------------------------------ */
|
||||
/* Fetch the latest domains from the feeds database */
|
||||
/* ------------------------------------------------ */
|
||||
|
||||
processHeartbeat.progress(LiveCrawlState.FETCH_LINKS);
|
||||
|
||||
Map<String, List<String>> urlsPerDomain = new HashMap<>(10_000);
|
||||
if (!feedsClient.waitReady(Duration.ofHours(1))) {
|
||||
throw new RuntimeException("Feeds client never became ready, cannot proceed with live crawling");
|
||||
}
|
||||
feedsClient.getUpdatedDomains(cutoff, urlsPerDomain::put);
|
||||
|
||||
logger.info("Fetched data for {} domains", urlsPerDomain.size());
|
||||
|
||||
|
||||
/* ------------------------------------- */
|
||||
/* Prune the database from old entries */
|
||||
/* ------------------------------------- */
|
||||
|
||||
processHeartbeat.progress(LiveCrawlState.PRUNE_DB);
|
||||
|
||||
// Remove data that is too old
|
||||
dataSet.prune(cutoff);
|
||||
|
||||
|
||||
/* ------------------------------------- */
|
||||
/* Fetch the links for each domain */
|
||||
/* ------------------------------------- */
|
||||
|
||||
processHeartbeat.progress(LiveCrawlState.CRAWLING);
|
||||
|
||||
try (SimpleLinkScraper fetcher = new SimpleLinkScraper(dataSet, domainQueries, domainBlacklist);
|
||||
CloseableHttpClient client = HttpClientProvider.createClient();
|
||||
try (SimpleLinkScraper fetcher = new SimpleLinkScraper(dataSet, domainCoordinator, domainQueries, client, domainBlacklist);
|
||||
var hb = heartbeat.createAdHocTaskHeartbeat("Live Crawling"))
|
||||
{
|
||||
for (Map.Entry<String, List<String>> entry : hb.wrap("Fetching", urlsPerDomain.entrySet())) {
|
||||
@@ -185,18 +216,29 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
fetcher.scheduleRetrieval(domain, urls);
|
||||
}
|
||||
}
|
||||
finally {
|
||||
client.close(CloseMode.GRACEFUL);
|
||||
}
|
||||
|
||||
Path tempPath = dataSet.createWorkDir();
|
||||
|
||||
|
||||
try {
|
||||
/* ------------------------------------- */
|
||||
/* Process the fetched links */
|
||||
/* ------------------------------------- */
|
||||
|
||||
processHeartbeat.progress(LiveCrawlState.PROCESSING);
|
||||
|
||||
try (var hb = heartbeat.createAdHocTaskHeartbeat("Processing");
|
||||
var writer = new ConverterBatchWriter(tempPath, 0)
|
||||
) {
|
||||
// Offset the documents' ordinals toward the upper range, to avoid an ID collisions with the
|
||||
// main indexes (the maximum permissible for doc ordinal is value is 67_108_863, so this
|
||||
// leaves us with a lot of headroom still)
|
||||
// We need unique document ids that do not collide with the document id from the main index,
|
||||
// so we offset the documents' ordinals toward the upper range.
|
||||
//
|
||||
// The maximum permissible value for a doc ordinal is 67_108_863,
|
||||
// so this leaves us with a lot of headroom still!
|
||||
// Expected document count here is order of 10 :^)
|
||||
writer.setOrdinalOffset(67_000_000);
|
||||
|
||||
for (SerializableCrawlDataStream stream : hb.wrap("Processing", dataSet.getDataStreams())) {
|
||||
@@ -204,10 +246,15 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* ---------------------------------------------- */
|
||||
/* Load the processed data into the link database */
|
||||
/* and construct an index journal for the docs */
|
||||
/* ---------------------------------------------- */
|
||||
|
||||
processHeartbeat.progress(LiveCrawlState.LOADING);
|
||||
|
||||
LoaderInputData lid = new LoaderInputData(tempPath, 1);
|
||||
|
||||
DomainIdRegistry domainIdRegistry = new DbDomainIdRegistry(dataSource);
|
||||
|
||||
keywordLoaderService.loadKeywords(domainIdRegistry, heartbeat, lid);
|
||||
@@ -219,9 +266,16 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
||||
FileUtils.deleteDirectory(tempPath.toFile());
|
||||
}
|
||||
|
||||
// Construct the index
|
||||
|
||||
/* ------------------------------------- */
|
||||
/* Finish up */
|
||||
/* ------------------------------------- */
|
||||
|
||||
processHeartbeat.progress(LiveCrawlState.DONE);
|
||||
|
||||
// After we return from here, the LiveCrawlActor will trigger an index construction
|
||||
// job. Unlike all the stuff we did in this process, it's identical to the real job
|
||||
// so we don't need to do anything special from this process
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -5,8 +5,8 @@ import crawlercommons.robots.SimpleRobotRulesParser;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.contenttype.ContentType;
|
||||
import nu.marginalia.contenttype.DocumentBodyToString;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.logic.DomainLocks;
|
||||
import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.coordination.DomainLock;
|
||||
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.db.DomainBlacklist;
|
||||
@@ -14,24 +14,21 @@ import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.core5.http.ClassicHttpRequest;
|
||||
import org.apache.hc.core5.http.io.entity.EntityUtils;
|
||||
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpHeaders;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.ThreadLocalRandom;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
/** A simple link scraper that fetches URLs and stores them in a database,
|
||||
* with no concept of a crawl frontier, WARC output, or other advanced features
|
||||
@@ -44,18 +41,21 @@ public class SimpleLinkScraper implements AutoCloseable {
|
||||
private final LiveCrawlDataSet dataSet;
|
||||
private final DbDomainQueries domainQueries;
|
||||
private final DomainBlacklist domainBlacklist;
|
||||
private final Duration connectTimeout = Duration.ofSeconds(10);
|
||||
private final Duration readTimeout = Duration.ofSeconds(10);
|
||||
private final DomainLocks domainLocks = new DomainLocks();
|
||||
private final DomainCoordinator domainCoordinator;
|
||||
|
||||
private final static int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);
|
||||
private final HttpClient httpClient;
|
||||
|
||||
public SimpleLinkScraper(LiveCrawlDataSet dataSet,
|
||||
DomainCoordinator domainCoordinator,
|
||||
DbDomainQueries domainQueries,
|
||||
HttpClient httpClient,
|
||||
DomainBlacklist domainBlacklist) {
|
||||
this.dataSet = dataSet;
|
||||
this.domainCoordinator = domainCoordinator;
|
||||
this.domainQueries = domainQueries;
|
||||
this.domainBlacklist = domainBlacklist;
|
||||
this.httpClient = httpClient;
|
||||
}
|
||||
|
||||
public void scheduleRetrieval(EdgeDomain domain, List<String> urls) {
|
||||
@@ -72,17 +72,19 @@ public class SimpleLinkScraper implements AutoCloseable {
|
||||
|
||||
EdgeUrl rootUrl = domain.toRootUrlHttps();
|
||||
|
||||
List<EdgeUrl> relevantUrls = new ArrayList<>();
|
||||
List<EdgeUrl> relevantUrls = new ArrayList<>(Math.max(1, urls.size()));
|
||||
|
||||
// Resolve absolute URLs
|
||||
for (var url : urls) {
|
||||
Optional<EdgeUrl> optParsedUrl = lp.parseLink(rootUrl, url);
|
||||
if (optParsedUrl.isEmpty()) {
|
||||
|
||||
if (optParsedUrl.isEmpty())
|
||||
continue;
|
||||
}
|
||||
if (dataSet.hasUrl(optParsedUrl.get())) {
|
||||
continue;
|
||||
}
|
||||
relevantUrls.add(optParsedUrl.get());
|
||||
|
||||
EdgeUrl absoluteUrl = optParsedUrl.get();
|
||||
|
||||
if (!dataSet.hasUrl(absoluteUrl))
|
||||
relevantUrls.add(absoluteUrl);
|
||||
}
|
||||
|
||||
if (relevantUrls.isEmpty()) {
|
||||
@@ -91,16 +93,10 @@ public class SimpleLinkScraper implements AutoCloseable {
|
||||
|
||||
int fetched = 0;
|
||||
|
||||
try (HttpClient client = HttpClient
|
||||
.newBuilder()
|
||||
.connectTimeout(connectTimeout)
|
||||
.followRedirects(HttpClient.Redirect.NEVER)
|
||||
.version(HttpClient.Version.HTTP_2)
|
||||
.build();
|
||||
// throttle concurrent access per domain; IDE will complain it's not used, but it holds a semaphore -- do not remove:
|
||||
DomainLocks.DomainLock lock = domainLocks.lockDomain(domain)
|
||||
try (// throttle concurrent access per domain; IDE will complain it's not used, but it holds a semaphore -- do not remove:
|
||||
DomainLock lock = domainCoordinator.lockDomain(domain)
|
||||
) {
|
||||
SimpleRobotRules rules = fetchRobotsRules(rootUrl, client);
|
||||
SimpleRobotRules rules = fetchRobotsRules(rootUrl);
|
||||
|
||||
if (rules == null) { // I/O error fetching robots.txt
|
||||
// If we can't fetch the robots.txt,
|
||||
@@ -113,18 +109,19 @@ public class SimpleLinkScraper implements AutoCloseable {
|
||||
CrawlDelayTimer timer = new CrawlDelayTimer(rules.getCrawlDelay());
|
||||
|
||||
for (var parsedUrl : relevantUrls) {
|
||||
|
||||
if (!rules.isAllowed(parsedUrl.toString())) {
|
||||
maybeFlagAsBad(parsedUrl);
|
||||
continue;
|
||||
}
|
||||
|
||||
switch (fetchUrl(domainId, parsedUrl, timer, client)) {
|
||||
switch (fetchUrl(domainId, parsedUrl, timer)) {
|
||||
case FetchResult.Success(int id, EdgeUrl docUrl, String body, String headers) -> {
|
||||
dataSet.saveDocument(id, docUrl, body, headers, "");
|
||||
fetched++;
|
||||
}
|
||||
case FetchResult.Error(EdgeUrl docUrl) -> maybeFlagAsBad(docUrl);
|
||||
case FetchResult.Error(EdgeUrl docUrl) -> {
|
||||
maybeFlagAsBad(docUrl);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -147,111 +144,107 @@ public class SimpleLinkScraper implements AutoCloseable {
|
||||
}
|
||||
|
||||
@Nullable
|
||||
private SimpleRobotRules fetchRobotsRules(EdgeUrl rootUrl, HttpClient client) throws IOException, InterruptedException, URISyntaxException {
|
||||
var robotsRequest = HttpRequest.newBuilder(rootUrl.withPathAndParam("/robots.txt", null).asURI())
|
||||
.GET()
|
||||
.header("User-Agent", WmsaHome.getUserAgent().uaString())
|
||||
.header("Accept-Encoding","gzip")
|
||||
.timeout(readTimeout);
|
||||
|
||||
// Fetch the robots.txt
|
||||
private SimpleRobotRules fetchRobotsRules(EdgeUrl rootUrl) throws URISyntaxException {
|
||||
ClassicHttpRequest request = ClassicRequestBuilder.get(rootUrl.withPathAndParam("/robots.txt", null).asURI())
|
||||
.setHeader("User-Agent", WmsaHome.getUserAgent().uaString())
|
||||
.setHeader("Accept-Encoding", "gzip")
|
||||
.build();
|
||||
|
||||
try {
|
||||
SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
|
||||
HttpResponse<byte[]> robotsTxt = client.send(robotsRequest.build(), HttpResponse.BodyHandlers.ofByteArray());
|
||||
|
||||
if (robotsTxt.statusCode() == 200) {
|
||||
return parser.parseContent(rootUrl.toString(),
|
||||
getResponseData(robotsTxt),
|
||||
robotsTxt.headers().firstValue("Content-Type").orElse("text/plain"),
|
||||
WmsaHome.getUserAgent().uaIdentifier());
|
||||
return httpClient.execute(request, rsp -> {
|
||||
if (rsp.getEntity() == null) {
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
if (rsp.getCode() == 200) {
|
||||
var contentTypeHeader = rsp.getFirstHeader("Content-Type");
|
||||
if (contentTypeHeader == null) {
|
||||
return null; // No content type header, can't parse
|
||||
}
|
||||
return new SimpleRobotRulesParser().parseContent(
|
||||
rootUrl.toString(),
|
||||
EntityUtils.toByteArray(rsp.getEntity()),
|
||||
contentTypeHeader.getValue(),
|
||||
WmsaHome.getUserAgent().uaIdentifier()
|
||||
);
|
||||
} else if (rsp.getCode() == 404) {
|
||||
return new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL);
|
||||
}
|
||||
} finally {
|
||||
EntityUtils.consumeQuietly(rsp.getEntity());
|
||||
}
|
||||
return null;
|
||||
});
|
||||
}
|
||||
catch (IOException e) {
|
||||
logger.error("Error fetching robots.txt for {}: {}", rootUrl, e.getMessage());
|
||||
return null; // I/O error fetching robots.txt
|
||||
}
|
||||
finally {
|
||||
try {
|
||||
TimeUnit.SECONDS.sleep(1);
|
||||
}
|
||||
else if (robotsTxt.statusCode() == 404) {
|
||||
return new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL);
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.error("Error fetching robots.txt for {}: {} {}", rootUrl, ex.getClass().getSimpleName(), ex.getMessage());
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Fetch a URL and store it in the database
|
||||
*/
|
||||
private FetchResult fetchUrl(int domainId, EdgeUrl parsedUrl, CrawlDelayTimer timer, HttpClient client) throws Exception {
|
||||
private FetchResult fetchUrl(int domainId, EdgeUrl parsedUrl, CrawlDelayTimer timer) throws Exception {
|
||||
|
||||
timer.waitFetchDelay();
|
||||
|
||||
HttpRequest request = HttpRequest.newBuilder(parsedUrl.asURI())
|
||||
.GET()
|
||||
.header("User-Agent", WmsaHome.getUserAgent().uaString())
|
||||
.header("Accept", "text/html")
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.timeout(readTimeout)
|
||||
ClassicHttpRequest request = ClassicRequestBuilder.get(parsedUrl.asURI())
|
||||
.setHeader("User-Agent", WmsaHome.getUserAgent().uaString())
|
||||
.setHeader("Accept", "text/html")
|
||||
.setHeader("Accept-Encoding", "gzip")
|
||||
.build();
|
||||
|
||||
try {
|
||||
HttpResponse<byte[]> response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
|
||||
return httpClient.execute(request, rsp -> {
|
||||
try {
|
||||
if (rsp.getCode() == 200) {
|
||||
String contentType = rsp.getFirstHeader("Content-Type").getValue();
|
||||
if (!contentType.toLowerCase().startsWith("text/html")) {
|
||||
return new FetchResult.Error(parsedUrl);
|
||||
}
|
||||
|
||||
// Handle rate limiting by waiting and retrying once
|
||||
if (response.statusCode() == 429) {
|
||||
timer.waitRetryDelay(new HttpFetcherImpl.RateLimitException(
|
||||
response.headers().firstValue("Retry-After").orElse("5")
|
||||
));
|
||||
response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
|
||||
}
|
||||
byte[] body = EntityUtils.toByteArray(rsp.getEntity(), MAX_SIZE);
|
||||
|
||||
String contentType = response.headers().firstValue("Content-Type").orElse("").toLowerCase();
|
||||
String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), body);
|
||||
|
||||
if (response.statusCode() == 200) {
|
||||
if (!contentType.toLowerCase().startsWith("text/html")) {
|
||||
return new FetchResult.Error(parsedUrl);
|
||||
StringBuilder headersStr = new StringBuilder();
|
||||
for (var header : rsp.getHeaders()) {
|
||||
headersStr.append(header.getName()).append(": ").append(header.getValue()).append("\n");
|
||||
}
|
||||
|
||||
return new FetchResult.Success(domainId, parsedUrl, bodyText, headersStr.toString());
|
||||
}
|
||||
} finally {
|
||||
if (rsp.getEntity() != null) {
|
||||
EntityUtils.consumeQuietly(rsp.getEntity());
|
||||
}
|
||||
}
|
||||
|
||||
byte[] body = getResponseData(response);
|
||||
if (body.length > MAX_SIZE) {
|
||||
return new FetchResult.Error(parsedUrl);
|
||||
}
|
||||
|
||||
String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), body);
|
||||
|
||||
return new FetchResult.Success(domainId, parsedUrl, bodyText, headersToString(response.headers()));
|
||||
}
|
||||
return new FetchResult.Error(parsedUrl);
|
||||
});
|
||||
}
|
||||
catch (IOException ex) {
|
||||
// We don't want a full stack trace on every error, as it's quite common and very noisy
|
||||
logger.error("Error fetching URL {}: {} {}", parsedUrl, ex.getClass().getSimpleName(), ex.getMessage());
|
||||
catch (IOException e) {
|
||||
logger.error("Error fetching {}: {}", parsedUrl, e.getMessage());
|
||||
// If we can't fetch the URL, we return an error result
|
||||
// so that the caller can decide what to do with it.
|
||||
}
|
||||
finally {
|
||||
timer.waitFetchDelay();
|
||||
}
|
||||
|
||||
return new FetchResult.Error(parsedUrl);
|
||||
}
|
||||
|
||||
private byte[] getResponseData(HttpResponse<byte[]> response) throws IOException {
|
||||
String encoding = response.headers().firstValue("Content-Encoding").orElse("");
|
||||
|
||||
if ("gzip".equals(encoding)) {
|
||||
try (var stream = new GZIPInputStream(new ByteArrayInputStream(response.body()))) {
|
||||
return stream.readAllBytes();
|
||||
}
|
||||
}
|
||||
else {
|
||||
return response.body();
|
||||
}
|
||||
}
|
||||
|
||||
sealed interface FetchResult {
|
||||
record Success(int domainId, EdgeUrl url, String body, String headers) implements FetchResult {}
|
||||
record Error(EdgeUrl url) implements FetchResult {}
|
||||
}
|
||||
|
||||
private String headersToString(HttpHeaders headers) {
|
||||
StringBuilder headersStr = new StringBuilder();
|
||||
headers.map().forEach((k, v) -> {
|
||||
headersStr.append(k).append(": ").append(v).append("\n");
|
||||
});
|
||||
return headersStr.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws Exception {
|
||||
pool.shutDown();
|
||||
|
@@ -0,0 +1,126 @@
|
||||
package nu.marginalia.livecrawler.io;
|
||||
|
||||
import com.google.inject.Provider;
|
||||
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.config.ConnectionConfig;
|
||||
import org.apache.hc.client5.http.config.RequestConfig;
|
||||
import org.apache.hc.client5.http.cookie.StandardCookieSpec;
|
||||
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
|
||||
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager;
|
||||
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder;
|
||||
import org.apache.hc.core5.http.HeaderElement;
|
||||
import org.apache.hc.core5.http.HeaderElements;
|
||||
import org.apache.hc.core5.http.HttpResponse;
|
||||
import org.apache.hc.core5.http.io.SocketConfig;
|
||||
import org.apache.hc.core5.http.message.MessageSupport;
|
||||
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||
import org.apache.hc.core5.util.TimeValue;
|
||||
import org.apache.hc.core5.util.Timeout;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.security.KeyManagementException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.util.Iterator;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class HttpClientProvider implements Provider<HttpClient> {
|
||||
private static final HttpClient client;
|
||||
private static PoolingHttpClientConnectionManager connectionManager;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(HttpClientProvider.class);
|
||||
|
||||
static {
|
||||
try {
|
||||
client = createClient();
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public static CloseableHttpClient createClient() throws NoSuchAlgorithmException, KeyManagementException {
|
||||
final ConnectionConfig connectionConfig = ConnectionConfig.custom()
|
||||
.setSocketTimeout(15, TimeUnit.SECONDS)
|
||||
.setConnectTimeout(15, TimeUnit.SECONDS)
|
||||
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
|
||||
.build();
|
||||
|
||||
|
||||
connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
|
||||
.setMaxConnPerRoute(2)
|
||||
.setMaxConnTotal(50)
|
||||
.setDefaultConnectionConfig(connectionConfig)
|
||||
.build();
|
||||
|
||||
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
|
||||
.setSoLinger(TimeValue.ofSeconds(-1))
|
||||
.setSoTimeout(Timeout.ofSeconds(10))
|
||||
.build()
|
||||
);
|
||||
|
||||
Thread.ofPlatform().daemon(true).start(() -> {
|
||||
try {
|
||||
for (;;) {
|
||||
TimeUnit.SECONDS.sleep(15);
|
||||
logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
|
||||
}
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
});
|
||||
|
||||
final RequestConfig defaultRequestConfig = RequestConfig.custom()
|
||||
.setCookieSpec(StandardCookieSpec.IGNORE)
|
||||
.setResponseTimeout(10, TimeUnit.SECONDS)
|
||||
.setConnectionRequestTimeout(5, TimeUnit.MINUTES)
|
||||
.build();
|
||||
|
||||
return HttpClients.custom()
|
||||
.setConnectionManager(connectionManager)
|
||||
.setRetryStrategy(new RetryStrategy())
|
||||
.setKeepAliveStrategy(new ConnectionKeepAliveStrategy() {
|
||||
// Default keep-alive duration is 3 minutes, but this is too long for us,
|
||||
// as we are either going to re-use it fairly quickly or close it for a long time.
|
||||
//
|
||||
// So we set it to 30 seconds or clamp the server-provided value to a minimum of 10 seconds.
|
||||
private static final TimeValue defaultValue = TimeValue.ofSeconds(30);
|
||||
|
||||
@Override
|
||||
public TimeValue getKeepAliveDuration(HttpResponse response, HttpContext context) {
|
||||
final Iterator<HeaderElement> it = MessageSupport.iterate(response, HeaderElements.KEEP_ALIVE);
|
||||
|
||||
while (it.hasNext()) {
|
||||
final HeaderElement he = it.next();
|
||||
final String param = he.getName();
|
||||
final String value = he.getValue();
|
||||
|
||||
if (value == null)
|
||||
continue;
|
||||
if (!"timeout".equalsIgnoreCase(param))
|
||||
continue;
|
||||
|
||||
try {
|
||||
long timeout = Long.parseLong(value);
|
||||
timeout = Math.clamp(timeout, 30, defaultValue.toSeconds());
|
||||
return TimeValue.ofSeconds(timeout);
|
||||
} catch (final NumberFormatException ignore) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return defaultValue;
|
||||
}
|
||||
})
|
||||
.disableRedirectHandling()
|
||||
.setDefaultRequestConfig(defaultRequestConfig)
|
||||
.build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HttpClient get() {
|
||||
return client;
|
||||
}
|
||||
}
|
||||
|
@@ -0,0 +1,79 @@
|
||||
package nu.marginalia.livecrawler.io;
|
||||
|
||||
import org.apache.hc.client5.http.HttpHostConnectException;
|
||||
import org.apache.hc.client5.http.HttpRequestRetryStrategy;
|
||||
import org.apache.hc.core5.http.HttpRequest;
|
||||
import org.apache.hc.core5.http.HttpResponse;
|
||||
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||
import org.apache.hc.core5.util.TimeValue;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.net.ssl.SSLException;
|
||||
import java.io.IOException;
|
||||
import java.net.SocketException;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.net.UnknownHostException;
|
||||
|
||||
public class RetryStrategy implements HttpRequestRetryStrategy {
|
||||
private static final Logger logger = LoggerFactory.getLogger(RetryStrategy.class);
|
||||
|
||||
@Override
|
||||
public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
|
||||
return switch (exception) {
|
||||
case SocketTimeoutException ste -> false;
|
||||
case SSLException ssle -> false;
|
||||
case UnknownHostException uhe -> false;
|
||||
case HttpHostConnectException ex -> executionCount < 2;
|
||||
case SocketException ex -> executionCount < 2;
|
||||
default -> executionCount <= 3;
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean retryRequest(HttpResponse response, int executionCount, HttpContext context) {
|
||||
return switch (response.getCode()) {
|
||||
case 500, 503 -> executionCount <= 2;
|
||||
case 429 -> executionCount <= 3;
|
||||
default -> false;
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public TimeValue getRetryInterval(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
|
||||
return TimeValue.ofSeconds(1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TimeValue getRetryInterval(HttpResponse response, int executionCount, HttpContext context) {
|
||||
|
||||
int statusCode = response.getCode();
|
||||
|
||||
// Give 503 a bit more time
|
||||
if (statusCode == 503) return TimeValue.ofSeconds(5);
|
||||
|
||||
if (statusCode == 429) {
|
||||
// get the Retry-After header
|
||||
var retryAfterHeader = response.getFirstHeader("Retry-After");
|
||||
if (retryAfterHeader == null) {
|
||||
return TimeValue.ofSeconds(3);
|
||||
}
|
||||
|
||||
String retryAfter = retryAfterHeader.getValue();
|
||||
if (retryAfter == null) {
|
||||
return TimeValue.ofSeconds(2);
|
||||
}
|
||||
|
||||
try {
|
||||
int retryAfterTime = Integer.parseInt(retryAfter);
|
||||
retryAfterTime = Math.clamp(retryAfterTime, 1, 5);
|
||||
|
||||
return TimeValue.ofSeconds(retryAfterTime);
|
||||
} catch (NumberFormatException e) {
|
||||
logger.warn("Invalid Retry-After header: {}", retryAfter);
|
||||
}
|
||||
}
|
||||
|
||||
return TimeValue.ofSeconds(2);
|
||||
}
|
||||
}
|
@@ -1,11 +1,15 @@
|
||||
package nu.marginalia.livecrawler;
|
||||
|
||||
import nu.marginalia.coordination.LocalDomainCoordinator;
|
||||
import nu.marginalia.db.DomainBlacklistImpl;
|
||||
import nu.marginalia.io.SerializableCrawlDataStream;
|
||||
import nu.marginalia.livecrawler.io.HttpClientProvider;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
|
||||
import org.apache.hc.core5.io.CloseMode;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
@@ -15,29 +19,34 @@ import org.mockito.Mockito;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.security.KeyManagementException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.List;
|
||||
|
||||
class SimpleLinkScraperTest {
|
||||
private Path tempDir;
|
||||
private LiveCrawlDataSet dataSet;
|
||||
private CloseableHttpClient httpClient;
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws IOException, SQLException {
|
||||
public void setUp() throws IOException, SQLException, NoSuchAlgorithmException, KeyManagementException {
|
||||
tempDir = Files.createTempDirectory(getClass().getSimpleName());
|
||||
dataSet = new LiveCrawlDataSet(tempDir);
|
||||
httpClient = HttpClientProvider.createClient();
|
||||
}
|
||||
|
||||
|
||||
@AfterEach
|
||||
public void tearDown() throws Exception {
|
||||
dataSet.close();
|
||||
httpClient.close(CloseMode.IMMEDIATE);
|
||||
FileUtils.deleteDirectory(tempDir.toFile());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRetrieveNow() throws Exception {
|
||||
var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class));
|
||||
var scraper = new SimpleLinkScraper(dataSet, new LocalDomainCoordinator(), null, httpClient, Mockito.mock(DomainBlacklistImpl.class));
|
||||
int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
|
||||
Assertions.assertEquals(1, fetched);
|
||||
|
||||
@@ -57,7 +66,7 @@ class SimpleLinkScraperTest {
|
||||
@Test
|
||||
public void testRetrieveNow_Redundant() throws Exception {
|
||||
dataSet.saveDocument(1, new EdgeUrl("https://www.marginalia.nu/"), "<html>", "", "127.0.0.1");
|
||||
var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class));
|
||||
var scraper = new SimpleLinkScraper(dataSet, new LocalDomainCoordinator(), null, httpClient, Mockito.mock(DomainBlacklistImpl.class));
|
||||
|
||||
// If the requested URL is already in the dataSet, we retrieveNow should shortcircuit and not fetch anything
|
||||
int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
|
||||
|
12
code/processes/new-domain-process/README.md
Normal file
@@ -0,0 +1,12 @@
|
||||
The new domain process (NDP) evaluates new domains for
inclusion in the search engine index.

It visits the root document of each candidate domain, ensures that it's reachable,
verifies that the response is valid HTML, and checks a few factors such as length
and links before deciding whether to assign the domain to a node.

The NDP process assigns new domains to the node with the fewest assigned domains.

The NDP process is triggered with a goal target number of domains to process, and
will find domains until that target is reached. If e.g. a goal of 100 is set
and 50 are in the index, it will find 50 more domains.
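As a rough sketch of the control flow described above (simplified and single-threaded; the actual implementation in `NdpMain` further down in this diff uses a worker pool and a process heartbeat):

```java
// Sketch only -- simplified from NdpMain.run(); the types used here
// (DomainNodeAllocator, DomainTestingQueue, DomainEvaluator, DomainToTest)
// are the ones introduced later in this diff.
static void growIndex(int goalCount,
                      DomainNodeAllocator allocator,
                      DomainTestingQueue queue,
                      DomainEvaluator evaluator) throws InterruptedException {
    // e.g. a goal of 100 with 50 domains already assigned -> find 50 more
    int toInsert = goalCount - allocator.totalCount();
    int accepted = 0;

    while (accepted < toInsert) {
        DomainToTest candidate = queue.next();   // next 'NEW' row from NDP_NEW_DOMAINS
        if (evaluator.evaluateDomain(candidate.domainName())) {
            // assign to the node that currently has the fewest domains
            queue.accept(candidate, allocator.nextNodeId());
            accepted++;
        } else {
            queue.reject(candidate);
        }
    }
}
```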
|
75
code/processes/new-domain-process/build.gradle
Normal file
@@ -0,0 +1,75 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
|
||||
application {
|
||||
mainClass = 'nu.marginalia.ndp.NdpMain'
|
||||
applicationName = 'ndp-process'
|
||||
}
|
||||
|
||||
tasks.distZip.enabled = false
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
|
||||
implementation project(':code:common:db')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
||||
|
||||
implementation project(':code:libraries:domain-lock')
|
||||
implementation project(':code:libraries:geo-ip')
|
||||
implementation project(':code:libraries:message-queue')
|
||||
implementation project(':code:libraries:blocking-thread-pool')
|
||||
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
|
||||
implementation project(':code:processes:process-mq-api')
|
||||
implementation project(':code:processes:crawling-process:ft-content-type')
|
||||
implementation project(':code:processes:crawling-process:ft-link-parser')
|
||||
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.notnull
|
||||
implementation libs.guava
|
||||
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
implementation libs.gson
|
||||
implementation libs.zstd
|
||||
implementation libs.bucket4j
|
||||
implementation libs.crawlercommons
|
||||
implementation libs.jsoup
|
||||
implementation libs.fastutil
|
||||
implementation libs.bundles.curator
|
||||
implementation libs.bundles.mariadb
|
||||
implementation libs.bundles.httpcomponents
|
||||
implementation libs.commons.lang3
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
|
||||
testImplementation libs.wiremock
|
||||
|
||||
|
||||
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
|
||||
testImplementation libs.commons.codec
|
||||
testImplementation 'org.testcontainers:mariadb:1.17.4'
|
||||
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
|
||||
testImplementation project(':code:libraries:test-helpers')
|
||||
|
||||
testImplementation project(':code:processes:test-data')
|
||||
}
|
||||
|
@@ -0,0 +1,146 @@
|
||||
package nu.marginalia.ndp;
|
||||
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.contenttype.ContentType;
|
||||
import nu.marginalia.contenttype.DocumentBodyToString;
|
||||
import nu.marginalia.coordination.DomainCoordinator;
|
||||
import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.ndp.io.HttpClientProvider;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.core5.http.io.entity.EntityUtils;
|
||||
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.security.KeyManagementException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.time.Duration;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/** Evaluates a domain to determine if it is worth indexing.
|
||||
* This class fetches the root document, checks the response code, content type,
|
||||
* and parses the HTML to ensure it smells alright.
|
||||
*/
|
||||
@Singleton
|
||||
public class DomainEvaluator {
|
||||
private final HttpClient client;
|
||||
private final String userAgentString = WmsaHome.getUserAgent().uaString();
|
||||
|
||||
private final LinkParser linkParser = new LinkParser();
|
||||
private final DomainCoordinator domainCoordinator;
|
||||
|
||||
@Inject
|
||||
public DomainEvaluator(DomainCoordinator domainCoordinator) throws NoSuchAlgorithmException, KeyManagementException {
|
||||
this.domainCoordinator = domainCoordinator;
|
||||
client = HttpClientProvider.createClient();
|
||||
}
|
||||
|
||||
public boolean evaluateDomain(String domainName) {
|
||||
var edgeDomain = new EdgeDomain(domainName);
|
||||
|
||||
// Grab a lock on the domain to prevent concurrent evaluations between processes
|
||||
try (var lock = domainCoordinator.lockDomain(edgeDomain)) {
|
||||
var rootUrl = edgeDomain.toRootUrlHttps();
|
||||
|
||||
var request = ClassicRequestBuilder.get(rootUrl.asURI())
|
||||
.addHeader("User-Agent", userAgentString)
|
||||
.addHeader("Accept-Encoding", "gzip")
|
||||
.addHeader("Accept", "text/html,application/xhtml+xml;q=0.9")
|
||||
.build();
|
||||
|
||||
return client.execute(request, (rsp) -> {
|
||||
if (rsp.getEntity() == null)
|
||||
return false;
|
||||
|
||||
try {
|
||||
// Check if the response code indicates a successful fetch
|
||||
if (200 != rsp.getCode()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
byte[] content;
|
||||
// Read the content from the response entity
|
||||
try (InputStream contentStream = rsp.getEntity().getContent()) {
|
||||
content = contentStream.readNBytes(8192);
|
||||
}
|
||||
|
||||
// Parse the content (if it's valid)
|
||||
ContentType contentType = ContentType.parse(rsp.getEntity().getContentType());
|
||||
|
||||
// Validate the content type
|
||||
if (!contentType.contentType().startsWith("text/html") && !contentType.contentType().startsWith("application/xhtml+xml"))
|
||||
return false;
|
||||
|
||||
// Parse the document body to a Jsoup Document
|
||||
final Document document = Jsoup.parse(DocumentBodyToString.getStringData(contentType, content));
|
||||
final String text = document.body().text();
|
||||
|
||||
if (text.length() < 100)
|
||||
return false;
|
||||
if (text.contains("404 Not Found") || text.contains("Page not found"))
|
||||
return false;
|
||||
if (hasMetaRefresh(document))
|
||||
return false; // This almost always indicates a parked domain
|
||||
if (!hasInternalLink(document, edgeDomain, rootUrl))
|
||||
return false; // No internal links means it's not worth indexing
|
||||
|
||||
return true;
|
||||
}
|
||||
catch (Exception e) {
|
||||
return false;
|
||||
}
|
||||
finally {
|
||||
// May or may not be necessary, but let's ensure we clean up the response entity
|
||||
// to avoid resource leaks
|
||||
EntityUtils.consumeQuietly(rsp.getEntity());
|
||||
|
||||
// Sleep for a while before yielding the lock, to avoid immediately hammering the domain
|
||||
// from another process
|
||||
sleepQuietly(Duration.ofSeconds(1));
|
||||
}
|
||||
});
|
||||
}
|
||||
catch (Exception ex) {
|
||||
return false; // If we fail to fetch or parse the domain, we consider it invalid
|
||||
}
|
||||
}
|
||||
|
||||
private boolean hasInternalLink(Document document, EdgeDomain currentDomain, EdgeUrl rootUrl) {
|
||||
for (Element atag : document.select("a")) {
|
||||
Optional<EdgeDomain> destDomain = linkParser
|
||||
.parseLink(rootUrl, atag)
|
||||
.map(EdgeUrl::getDomain);
|
||||
|
||||
if (destDomain.isPresent() && Objects.equals(currentDomain, destDomain.get()))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean hasMetaRefresh(Document document) {
|
||||
for (Element metaTag : document.select("meta")) {
|
||||
if ("refresh".equalsIgnoreCase(metaTag.attr("http-equiv")))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private void sleepQuietly(Duration duration) {
|
||||
try {
|
||||
TimeUnit.MILLISECONDS.sleep(duration.toMillis());
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
@@ -0,0 +1,134 @@
|
||||
package nu.marginalia.ndp;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.nodecfg.NodeConfigurationService;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.PriorityQueue;
|
||||
import java.util.Set;
|
||||
|
||||
/** DomainNodeAllocator is responsible for assigning domains to partitions/nodes,
 * ensuring that domains are evenly distributed across the nodes.
 */
|
||||
public class DomainNodeAllocator {
|
||||
|
||||
private final NodeConfigurationService nodeConfigurationService;
|
||||
private final HikariDataSource dataSource;
|
||||
private final PriorityQueue<NodeCount> countPerNode = new PriorityQueue<>();
|
||||
|
||||
private volatile boolean initialized = false;
|
||||
|
||||
private record NodeCount(int nodeId, int count)
|
||||
implements Comparable<NodeCount>
|
||||
{
|
||||
public NodeCount incrementCount() {
|
||||
return new NodeCount(nodeId, count + 1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(@NotNull DomainNodeAllocator.NodeCount o) {
|
||||
return Integer.compare(this.count, o.count);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Inject
|
||||
public DomainNodeAllocator(NodeConfigurationService nodeConfigurationService, HikariDataSource dataSource) {
|
||||
this.nodeConfigurationService = nodeConfigurationService;
|
||||
this.dataSource = dataSource;
|
||||
|
||||
Thread.ofPlatform()
|
||||
.name("DomainNodeAllocator::initialize()")
|
||||
.start(this::initialize);
|
||||
}
|
||||
|
||||
public synchronized int totalCount() {
|
||||
ensureInitialized();
|
||||
return countPerNode.stream().mapToInt(NodeCount::count).sum();
|
||||
}
|
||||
|
||||
/** Returns the next node ID to assign a domain to.
|
||||
* This method is synchronized to ensure thread safety when multiple threads are allocating domains.
|
||||
* The node ID returned is guaranteed to be one of the viable nodes configured in the system.
|
||||
*/
|
||||
public synchronized int nextNodeId() {
|
||||
ensureInitialized();
|
||||
|
||||
// Synchronized is fine here as this is not a hot path
|
||||
// (and PriorityBlockingQueue won't help since we're re-adding the same element with a new count all the time)
|
||||
|
||||
NodeCount allocation = countPerNode.remove();
|
||||
countPerNode.add(allocation.incrementCount());
|
||||
return allocation.nodeId();
|
||||
}
|
||||
|
||||
|
||||
private void ensureInitialized() {
|
||||
if (initialized) return;
|
||||
|
||||
synchronized (this) {
|
||||
while (!initialized) {
|
||||
try {
|
||||
// Wait until the initialization is complete
|
||||
this.wait(1000);
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
throw new RuntimeException("DomainAllocator initialization interrupted", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void initialize() {
|
||||
if (initialized) return;
|
||||
|
||||
Set<Integer> viableNodes = new HashSet<>();
|
||||
|
||||
// Find all viable nodes that can handle batch crawls
|
||||
for (var node : nodeConfigurationService.getAll()) {
|
||||
if (node.disabled())
|
||||
continue;
|
||||
if (!node.autoAssignDomains())
|
||||
continue;
|
||||
|
||||
if (node.profile().permitBatchCrawl())
|
||||
viableNodes.add(node.node());
|
||||
}
|
||||
|
||||
// Fetch the current counts of domains per node from the database
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT COUNT(*) AS CNT, NODE_AFFINITY
|
||||
FROM EC_DOMAIN
|
||||
WHERE NODE_AFFINITY>0
|
||||
GROUP BY NODE_AFFINITY
|
||||
"""))
|
||||
{
|
||||
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
|
||||
int nodeId = rs.getInt("NODE_AFFINITY");
|
||||
int count = rs.getInt("CNT");
|
||||
|
||||
if (viableNodes.remove(nodeId)) {
|
||||
countPerNode.add(new NodeCount(nodeId, count));
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Failed to load domain counts from database", e);
|
||||
}
|
||||
|
||||
// Add any remaining viable nodes that were not found in the database
|
||||
for (int nodeId : viableNodes) {
|
||||
countPerNode.add(new NodeCount(nodeId, 0));
|
||||
}
|
||||
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
|
||||
}
|
@@ -0,0 +1,240 @@
|
||||
package nu.marginalia.ndp;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import it.unimi.dsi.fastutil.ints.Int2IntMap;
|
||||
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
|
||||
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
|
||||
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
|
||||
import nu.marginalia.ndp.model.DomainToTest;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.Connection;
|
||||
import java.sql.ResultSet;
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
public class DomainTestingQueue {
|
||||
private static Logger logger = LoggerFactory.getLogger(DomainTestingQueue.class);
|
||||
|
||||
private final ArrayBlockingQueue<DomainToTest> queue = new ArrayBlockingQueue<>(2);
|
||||
|
||||
// This will grow quite large, but should be manageable in memory, as the theoretical maximum is around 100M domains,
|
||||
// order of 2 GB in memory.
|
||||
private final ConcurrentHashMap<String, Boolean> takenDomains = new ConcurrentHashMap<>();
|
||||
|
||||
private final HikariDataSource dataSource;
|
||||
private final AggregateLinkGraphClient linkGraphClient;
|
||||
|
||||
|
||||
@Inject
|
||||
public DomainTestingQueue(HikariDataSource dataSource,
|
||||
AggregateLinkGraphClient linkGraphClient
|
||||
) {
|
||||
this.dataSource = dataSource;
|
||||
this.linkGraphClient = linkGraphClient;
|
||||
|
||||
Thread.ofPlatform()
|
||||
.name("DomainTestingQueue::fetch()")
|
||||
.start(this::fetch);
|
||||
}
|
||||
|
||||
public DomainToTest next() throws InterruptedException {
|
||||
return queue.take();
|
||||
}
|
||||
|
||||
public void accept(DomainToTest domain, int nodeId) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var flagOkStmt = conn.prepareStatement("""
|
||||
UPDATE NDP_NEW_DOMAINS
|
||||
SET STATE='ACCEPTED'
|
||||
WHERE DOMAIN_ID=?
|
||||
""");
|
||||
var assignNodeStmt = conn.prepareStatement("""
|
||||
UPDATE EC_DOMAIN SET NODE_AFFINITY=?
|
||||
WHERE ID=?
|
||||
AND EC_DOMAIN.NODE_AFFINITY < 0
|
||||
""")
|
||||
)
|
||||
{
|
||||
conn.setAutoCommit(false);
|
||||
flagOkStmt.setInt(1, domain.domainId());
|
||||
flagOkStmt.executeUpdate();
|
||||
|
||||
assignNodeStmt.setInt(1, nodeId);
|
||||
assignNodeStmt.setInt(2, domain.domainId());
|
||||
assignNodeStmt.executeUpdate();
|
||||
conn.commit();
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Failed to accept domain in database", e);
|
||||
}
|
||||
}
|
||||
|
||||
public void reject(DomainToTest domain) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
UPDATE NDP_NEW_DOMAINS
|
||||
SET STATE='REJECTED', CHECK_COUNT=CHECK_COUNT + 1
|
||||
WHERE DOMAIN_ID=?
|
||||
"""))
|
||||
{
|
||||
conn.setAutoCommit(false);
|
||||
stmt.setInt(1, domain.domainId());
|
||||
stmt.executeUpdate();
|
||||
conn.commit();
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Failed to reject domain in database", e);
|
||||
}
|
||||
}
|
||||
|
||||
public void fetch() {
|
||||
while (true) {
|
||||
List<DomainToTest> domains = new ArrayList<>(2000);
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT DOMAIN_ID, DOMAIN_NAME
|
||||
FROM NDP_NEW_DOMAINS
|
||||
INNER JOIN EC_DOMAIN ON ID=DOMAIN_ID
|
||||
WHERE NDP_NEW_DOMAINS.STATE = 'NEW'
|
||||
ORDER BY PRIORITY DESC
|
||||
LIMIT 2000
|
||||
"""))
|
||||
{
|
||||
var rs = stmt.executeQuery();
|
||||
|
||||
while (rs.next()) {
|
||||
int domainId = rs.getInt("DOMAIN_ID");
|
||||
String domainName = rs.getString("DOMAIN_NAME");
|
||||
if (takenDomains.put(domainName, true) != null) {
|
||||
logger.warn("Domain {} is already processed, skipping", domainName);
|
||||
continue; // Skip if already taken
|
||||
}
|
||||
domains.add(new DomainToTest(domainName, domainId));
|
||||
}
|
||||
|
||||
if (domains.isEmpty()) {
|
||||
if (!refreshQueue(conn)) {
|
||||
throw new RuntimeException("No new domains found, aborting!");
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (RuntimeException e) {
|
||||
throw e; // Rethrow runtime exceptions to avoid wrapping them in another runtime exception
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException("Failed to fetch domains from database", e);
|
||||
}
|
||||
|
||||
try {
|
||||
for (var domain : domains) {
|
||||
queue.put(domain);
|
||||
}
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
throw new RuntimeException("Domain fetching interrupted", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean refreshQueue(Connection conn) {
|
||||
logger.info("Refreshing domain queue in database");
|
||||
|
||||
Int2IntMap domainIdToCount = new Int2IntOpenHashMap();
|
||||
|
||||
// Load known domain IDs from the database to avoid inserting duplicates from NDP_NEW_DOMAINS
|
||||
// or domains that are already assigned to a node
|
||||
{
|
||||
IntOpenHashSet knownIds = new IntOpenHashSet();
|
||||
|
||||
try (var stmt = conn.createStatement()) {
|
||||
ResultSet rs = stmt.executeQuery("SELECT DOMAIN_ID FROM NDP_NEW_DOMAINS");
|
||||
rs.setFetchSize(10_000);
|
||||
while (rs.next()) {
|
||||
int domainId = rs.getInt("DOMAIN_ID");
|
||||
knownIds.add(domainId);
|
||||
}
|
||||
|
||||
rs = stmt.executeQuery("SELECT ID FROM EC_DOMAIN WHERE NODE_AFFINITY>=0");
|
||||
rs.setFetchSize(10_000);
|
||||
while (rs.next()) {
|
||||
int domainId = rs.getInt("ID");
|
||||
knownIds.add(domainId);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Failed to load known domain IDs from database", e);
|
||||
}
|
||||
|
||||
// Ensure the link graph is ready before proceeding. This is mainly necessary in a cold reboot
|
||||
// of the entire system.
|
||||
try {
|
||||
logger.info("Waiting for link graph client to be ready...");
|
||||
linkGraphClient.waitReady(Duration.ofHours(1));
|
||||
logger.info("Link graph client is ready, fetching domain links...");
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
// Fetch all domain links from the link graph and count by how many sources each dest domain is linked from
|
||||
var iter = linkGraphClient.getAllDomainLinks().iterator();
|
||||
while (iter.advance()) {
|
||||
int dest = iter.dest();
|
||||
if (!knownIds.contains(dest)) {
|
||||
domainIdToCount.mergeInt(dest, 1, (i, j) -> i + j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
boolean didInsert = false;
|
||||
|
||||
/* Insert new domains into NDP_NEW_DOMAINS table */
|
||||
try (var insertStmt = conn.prepareStatement("""
|
||||
INSERT IGNORE INTO NDP_NEW_DOMAINS (DOMAIN_ID, PRIORITY) VALUES (?, ?)
|
||||
""")) {
|
||||
conn.setAutoCommit(false);
|
||||
|
||||
int cnt = 0;
|
||||
for (var entry : domainIdToCount.int2IntEntrySet()) {
|
||||
int domainId = entry.getIntKey();
|
||||
int count = entry.getIntValue();
|
||||
|
||||
insertStmt.setInt(1, domainId);
|
||||
insertStmt.setInt(2, count);
|
||||
insertStmt.addBatch();
|
||||
|
||||
if (++cnt >= 1000) {
|
||||
cnt = 0;
|
||||
insertStmt.executeBatch(); // Execute in batches to avoid memory issues
|
||||
conn.commit();
|
||||
didInsert = true;
|
||||
}
|
||||
}
|
||||
if (cnt != 0) {
|
||||
insertStmt.executeBatch(); // Execute any remaining batch
|
||||
conn.commit();
|
||||
didInsert = true;
|
||||
}
|
||||
|
||||
logger.info("Queue refreshed successfully");
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Failed to refresh queue in database", e);
|
||||
}
|
||||
|
||||
// Clean up NDP_NEW_DOMAINS table to remove any domains that are already in EC_DOMAIN
|
||||
// This acts not only to clean up domains that we've flagged as ACCEPTED, but also to
|
||||
// repair inconsistent states where domains might have incorrectly been added to NDP_NEW_DOMAINS
|
||||
try (var stmt = conn.createStatement()) {
|
||||
stmt.executeUpdate("DELETE FROM NDP_NEW_DOMAINS WHERE DOMAIN_ID IN (SELECT ID FROM EC_DOMAIN WHERE NODE_AFFINITY>=0)");
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException("Failed to clean up NDP_NEW_DOMAINS", e);
|
||||
}
|
||||
|
||||
return didInsert;
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,159 @@
|
||||
package nu.marginalia.ndp;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Injector;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.coordination.DomainCoordinationModule;
|
||||
import nu.marginalia.db.DomainBlacklist;
|
||||
import nu.marginalia.geoip.GeoIpDictionary;
|
||||
import nu.marginalia.mq.MessageQueueFactory;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.mqapi.ndp.NdpRequest;
|
||||
import nu.marginalia.ndp.model.DomainToTest;
|
||||
import nu.marginalia.process.ProcessConfiguration;
|
||||
import nu.marginalia.process.ProcessConfigurationModule;
|
||||
import nu.marginalia.process.ProcessMainClass;
|
||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||
import nu.marginalia.service.module.DatabaseModule;
|
||||
import nu.marginalia.service.module.ServiceDiscoveryModule;
|
||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.security.Security;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
public class NdpMain extends ProcessMainClass {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(NdpMain.class);
|
||||
private final DomainNodeAllocator domainNodeAllocator;
|
||||
private final DomainTestingQueue domainTestingQueue;
|
||||
private final ProcessHeartbeat processHeartbeat;
|
||||
private final DomainEvaluator domainEvaluator;
|
||||
private final DomainBlacklist domainBlacklist;
|
||||
|
||||
private final AtomicInteger domainCount = new AtomicInteger(0);
|
||||
|
||||
@Inject
|
||||
public NdpMain(MessageQueueFactory messageQueueFactory,
|
||||
ProcessConfiguration config,
|
||||
DomainNodeAllocator domainNodeAllocator,
|
||||
DomainTestingQueue domainTestingQueue,
|
||||
DomainEvaluator domainEvaluator,
|
||||
DomainBlacklist domainBlacklist,
|
||||
ProcessHeartbeat processHeartbeat,
|
||||
Gson gson)
|
||||
{
|
||||
super(messageQueueFactory, config, gson, ProcessInboxNames.NDP_INBOX);
|
||||
|
||||
this.domainNodeAllocator = domainNodeAllocator;
|
||||
this.domainEvaluator = domainEvaluator;
|
||||
this.domainBlacklist = domainBlacklist;
|
||||
this.domainTestingQueue = domainTestingQueue;
|
||||
this.processHeartbeat = processHeartbeat;
|
||||
}
|
||||
|
||||
|
||||
public void run(int goalCount) throws InterruptedException {
|
||||
logger.info("Wait for blacklist to load...");
|
||||
domainBlacklist.waitUntilLoaded();
|
||||
|
||||
SimpleBlockingThreadPool threadPool = new SimpleBlockingThreadPool(
|
||||
"NDP-Worker",
|
||||
8,
|
||||
10,
|
||||
SimpleBlockingThreadPool.ThreadType.PLATFORM
|
||||
);
|
||||
|
||||
logger.info("Starting NDP process");
|
||||
|
||||
int toInsertCount = goalCount - domainNodeAllocator.totalCount();
|
||||
|
||||
if (toInsertCount <= 0) {
|
||||
logger.info("No new domains to process. Current count: " + domainNodeAllocator.totalCount());
|
||||
return;
|
||||
}
|
||||
|
||||
try (var hb = processHeartbeat.createAdHocTaskHeartbeat("Growing Index")) {
|
||||
int cnt;
|
||||
while ((cnt = domainCount.get()) < toInsertCount) {
|
||||
if (cnt % 100 == 0) {
|
||||
hb.progress("Discovery Process", cnt, toInsertCount);
|
||||
}
|
||||
|
||||
final DomainToTest nextDomain = domainTestingQueue.next();
|
||||
threadPool.submit(() -> {
|
||||
try {
|
||||
if (domainEvaluator.evaluateDomain(nextDomain.domainName())) {
|
||||
logger.info("Accepting: {}", nextDomain.domainName());
|
||||
domainCount.incrementAndGet();
|
||||
domainTestingQueue.accept(nextDomain, domainNodeAllocator.nextNodeId());
|
||||
} else {
|
||||
logger.info("Rejecting: {}", nextDomain.domainName());
|
||||
domainTestingQueue.reject(nextDomain);
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
domainTestingQueue.reject(nextDomain);
|
||||
logger.error("Error evaluating domain: " + nextDomain.domainId(), e);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
threadPool.shutDown();
|
||||
// Wait for all tasks to complete or give up after 1 hour
|
||||
threadPool.awaitTermination(1, TimeUnit.HOURS);
|
||||
|
||||
logger.info("NDP process completed. Total domains processed: " + domainCount.get());
|
||||
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
// Prevent Java from caching DNS lookups forever (filling up the system RAM as a result)
|
||||
Security.setProperty("networkaddress.cache.ttl" , "3600");
|
||||
|
||||
// This must run *early*
|
||||
System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
|
||||
|
||||
// If these aren't set properly, the JVM will hang forever on some requests
|
||||
System.setProperty("sun.net.client.defaultConnectTimeout", "30000");
|
||||
System.setProperty("sun.net.client.defaultReadTimeout", "30000");
|
||||
|
||||
// Set the maximum number of connections to keep alive in the connection pool
|
||||
System.setProperty("jdk.httpclient.idleTimeout", "15"); // 15 seconds
|
||||
System.setProperty("jdk.httpclient.connectionPoolSize", "256");
|
||||
|
||||
// We don't want to use too much memory caching sessions for https
|
||||
System.setProperty("javax.net.ssl.sessionCacheSize", "2048");
|
||||
|
||||
|
||||
Injector injector = Guice.createInjector(
|
||||
new NdpModule(),
|
||||
new ServiceDiscoveryModule(),
|
||||
new DomainCoordinationModule(),
|
||||
new ProcessConfigurationModule("ndp"),
|
||||
new DatabaseModule(false)
|
||||
);
|
||||
|
||||
GeoIpDictionary geoIpDictionary = injector.getInstance(GeoIpDictionary.class);
|
||||
|
||||
geoIpDictionary.waitReady(); // Ensure the GeoIpDictionary is ready before proceeding
|
||||
|
||||
NdpMain main = injector.getInstance(NdpMain.class);
|
||||
|
||||
var instructions = main.fetchInstructions(NdpRequest.class);
|
||||
|
||||
try {
|
||||
main.run(instructions.value().goal());
|
||||
instructions.ok();
|
||||
}
|
||||
catch (Throwable ex) {
|
||||
logger.error("Error running ping process", ex);
|
||||
instructions.err();
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,8 @@
|
||||
package nu.marginalia.ndp;
|
||||
|
||||
import com.google.inject.AbstractModule;
|
||||
|
||||
public class NdpModule extends AbstractModule {
|
||||
public void configure() {
|
||||
}
|
||||
}
|
@@ -0,0 +1,126 @@
package nu.marginalia.ndp.io;

import com.google.inject.Provider;
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.config.ConnectionConfig;
import org.apache.hc.client5.http.config.RequestConfig;
import org.apache.hc.client5.http.cookie.StandardCookieSpec;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager;
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder;
import org.apache.hc.core5.http.HeaderElement;
import org.apache.hc.core5.http.HeaderElements;
import org.apache.hc.core5.http.HttpResponse;
import org.apache.hc.core5.http.io.SocketConfig;
import org.apache.hc.core5.http.message.MessageSupport;
import org.apache.hc.core5.http.protocol.HttpContext;
import org.apache.hc.core5.util.TimeValue;
import org.apache.hc.core5.util.Timeout;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.util.Iterator;
import java.util.concurrent.TimeUnit;

public class HttpClientProvider implements Provider<HttpClient> {
    private static final HttpClient client;
    private static PoolingHttpClientConnectionManager connectionManager;

    private static final Logger logger = LoggerFactory.getLogger(HttpClientProvider.class);

    static {
        try {
            client = createClient();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    public static CloseableHttpClient createClient() throws NoSuchAlgorithmException, KeyManagementException {
        final ConnectionConfig connectionConfig = ConnectionConfig.custom()
                .setSocketTimeout(15, TimeUnit.SECONDS)
                .setConnectTimeout(15, TimeUnit.SECONDS)
                .setValidateAfterInactivity(TimeValue.ofSeconds(5))
                .build();

        connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
                .setMaxConnPerRoute(2)
                .setMaxConnTotal(50)
                .setDefaultConnectionConfig(connectionConfig)
                .build();

        connectionManager.setDefaultSocketConfig(SocketConfig.custom()
                .setSoLinger(TimeValue.ofSeconds(-1))
                .setSoTimeout(Timeout.ofSeconds(10))
                .build()
        );

        Thread.ofPlatform().daemon(true).start(() -> {
            try {
                for (;;) {
                    TimeUnit.SECONDS.sleep(15);
                    logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
                }
            }
            catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        });

        final RequestConfig defaultRequestConfig = RequestConfig.custom()
                .setCookieSpec(StandardCookieSpec.IGNORE)
                .setResponseTimeout(10, TimeUnit.SECONDS)
                .setConnectionRequestTimeout(5, TimeUnit.MINUTES)
                .build();

        return HttpClients.custom()
                .setConnectionManager(connectionManager)
                .setRetryStrategy(new RetryStrategy())
                .setKeepAliveStrategy(new ConnectionKeepAliveStrategy() {
                    // The default keep-alive duration is 3 minutes, but this is too long for us,
                    // as we are either going to re-use the connection fairly quickly or close it for a long time.
                    //
                    // So we set it to 30 seconds, or clamp the server-provided value to a minimum of 10 seconds.
                    private static final TimeValue defaultValue = TimeValue.ofSeconds(30);

                    @Override
                    public TimeValue getKeepAliveDuration(HttpResponse response, HttpContext context) {
                        final Iterator<HeaderElement> it = MessageSupport.iterate(response, HeaderElements.KEEP_ALIVE);

                        while (it.hasNext()) {
                            final HeaderElement he = it.next();
                            final String param = he.getName();
                            final String value = he.getValue();

                            if (value == null)
                                continue;
                            if (!"timeout".equalsIgnoreCase(param))
                                continue;

                            try {
                                long timeout = Long.parseLong(value);
                                timeout = Math.clamp(timeout, 10, defaultValue.toSeconds());
                                return TimeValue.ofSeconds(timeout);
                            } catch (final NumberFormatException ignore) {
                                break;
                            }
                        }
                        return defaultValue;
                    }
                })
                .disableRedirectHandling()
                .setDefaultRequestConfig(defaultRequestConfig)
                .build();
    }

    @Override
    public HttpClient get() {
        return client;
    }
}
@@ -0,0 +1,79 @@
package nu.marginalia.ndp.io;

import org.apache.hc.client5.http.HttpHostConnectException;
import org.apache.hc.client5.http.HttpRequestRetryStrategy;
import org.apache.hc.core5.http.HttpRequest;
import org.apache.hc.core5.http.HttpResponse;
import org.apache.hc.core5.http.protocol.HttpContext;
import org.apache.hc.core5.util.TimeValue;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.net.ssl.SSLException;
import java.io.IOException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.UnknownHostException;

public class RetryStrategy implements HttpRequestRetryStrategy {
    private static final Logger logger = LoggerFactory.getLogger(RetryStrategy.class);

    @Override
    public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
        return switch (exception) {
            case SocketTimeoutException ste -> false;
            case SSLException ssle -> false;
            case UnknownHostException uhe -> false;
            case HttpHostConnectException ex -> executionCount < 2;
            case SocketException ex -> executionCount < 2;
            default -> executionCount <= 3;
        };
    }

    @Override
    public boolean retryRequest(HttpResponse response, int executionCount, HttpContext context) {
        return switch (response.getCode()) {
            case 500, 503 -> executionCount <= 2;
            case 429 -> executionCount <= 3;
            default -> false;
        };
    }

    @Override
    public TimeValue getRetryInterval(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
        return TimeValue.ofSeconds(1);
    }

    @Override
    public TimeValue getRetryInterval(HttpResponse response, int executionCount, HttpContext context) {

        int statusCode = response.getCode();

        // Give 503 a bit more time
        if (statusCode == 503) return TimeValue.ofSeconds(5);

        if (statusCode == 429) {
            // get the Retry-After header
            var retryAfterHeader = response.getFirstHeader("Retry-After");
            if (retryAfterHeader == null) {
                return TimeValue.ofSeconds(3);
            }

            String retryAfter = retryAfterHeader.getValue();
            if (retryAfter == null) {
                return TimeValue.ofSeconds(2);
            }

            try {
                int retryAfterTime = Integer.parseInt(retryAfter);
                retryAfterTime = Math.clamp(retryAfterTime, 1, 5);

                return TimeValue.ofSeconds(retryAfterTime);
            } catch (NumberFormatException e) {
                logger.warn("Invalid Retry-After header: {}", retryAfter);
            }
        }

        return TimeValue.ofSeconds(2);
    }
}
@@ -0,0 +1,4 @@
package nu.marginalia.ndp.model;

public record DomainToTest(String domainName, int domainId) {
}
@@ -0,0 +1,29 @@
package nu.marginalia.ndp;

import nu.marginalia.coordination.LocalDomainCoordinator;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;

import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;

import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

class DomainEvaluatorTest {

    @Tag("flaky") // Exclude from CI runs due to potential network issues
    @Test
    public void testSunnyDay() throws NoSuchAlgorithmException, KeyManagementException {
        DomainEvaluator evaluator = new DomainEvaluator(new LocalDomainCoordinator());

        // Should be a valid domain
        assertTrue(evaluator.evaluateDomain("www.marginalia.nu"));

        // Should be a redirect to www.marginalia.nu
        assertFalse(evaluator.evaluateDomain("memex.marginalia.nu"));

        // Should fail on Anubis
        assertFalse(evaluator.evaluateDomain("marginalia-search.com"));
    }
}
code/processes/ping-process/README.md (Normal file, 12 lines)
@@ -0,0 +1,12 @@
The ping process (which has nothing to do with ICMP ping) keeps track of
the aliveness of websites. It also gathers fingerprint information about
each website's security posture, as well as DNS information.

This information is kept to build a picture of when a website is down, and to
identify ownership changes and other significant events in the lifecycle of a
website.

# Central Classes

* [PingMain](java/nu/marginalia/ping/PingMain.java), the main class.
* [PingJobScheduler](java/nu/marginalia/ping/PingJobScheduler.java), the service that dispatches pings.
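The README above only describes the availability tracking at a high level. As a rough, hypothetical sketch of the kind of probe such a process might perform, using the same Apache HttpClient 5 API seen elsewhere in this diff: this is not the actual PingMain or PingJobScheduler code, and the class and record names below are invented for illustration.

```java
// Hypothetical illustration only; not part of the ping-process sources in this diff.
import org.apache.hc.client5.http.classic.methods.HttpHead;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.HttpClients;

import java.time.Duration;
import java.time.Instant;

public class AlivenessProbeSketch {
    /** Minimal result carrier; the real process records far richer fingerprint and DNS data. */
    public record ProbeResult(boolean reachable, int statusCode, Duration latency) {}

    public static ProbeResult probe(CloseableHttpClient client, String domainName) {
        Instant start = Instant.now();
        try {
            // A HEAD request is enough to establish that the site answers at all.
            return client.execute(new HttpHead("https://" + domainName + "/"),
                    response -> new ProbeResult(true, response.getCode(),
                            Duration.between(start, Instant.now())));
        }
        catch (Exception e) {
            return new ProbeResult(false, -1, Duration.between(start, Instant.now()));
        }
    }

    public static void main(String[] args) throws Exception {
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            System.out.println(probe(client, "www.marginalia.nu"));
        }
    }
}
```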
code/processes/ping-process/build.gradle (Normal file, 72 lines)
@@ -0,0 +1,72 @@
plugins {
    id 'java'

    id 'application'
    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
    }
}

application {
    mainClass = 'nu.marginalia.ping.PingMain'
    applicationName = 'ping-process'
}

tasks.distZip.enabled = false

apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {

    implementation project(':code:common:db')
    implementation project(':code:common:model')
    implementation project(':code:common:config')
    implementation project(':code:common:service')

    implementation project(':code:libraries:domain-lock')
    implementation project(':code:libraries:geo-ip')
    implementation project(':code:libraries:message-queue')

    implementation project(':code:processes:process-mq-api')
    implementation libs.bundles.slf4j
    implementation libs.notnull
    implementation libs.guava

    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
    implementation libs.gson
    implementation libs.zstd
    implementation libs.bucket4j
    implementation libs.crawlercommons
    implementation libs.jsoup
    implementation libs.fastutil
    implementation libs.bundles.curator
    implementation libs.bundles.mariadb
    implementation libs.bundles.httpcomponents
    implementation libs.commons.lang3

    implementation 'org.bouncycastle:bcprov-jdk18on:1.80'
    implementation 'org.bouncycastle:bcpkix-jdk18on:1.80'
    implementation 'dnsjava:dnsjava:3.5.2'

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito

    testImplementation libs.wiremock

    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
    testImplementation libs.commons.codec
    testImplementation 'org.testcontainers:mariadb:1.17.4'
    testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
    testImplementation project(':code:libraries:test-helpers')

    testImplementation project(':code:processes:test-data')
}
@@ -0,0 +1,84 @@
package nu.marginalia.ping;

import com.google.inject.Inject;
import nu.marginalia.ping.model.ErrorClassification;

import java.time.Duration;
import java.util.Map;
import java.util.concurrent.ThreadLocalRandom;

public class BackoffStrategy {

    private final Map<ErrorClassification, Duration> baseIntervals;
    private final Map<ErrorClassification, Duration> maxIntervals;
    private final Duration okInterval;

    @Inject
    public BackoffStrategy(PingIntervalsConfiguration pingIntervalsConfiguration) {
        this.baseIntervals = pingIntervalsConfiguration.baseIntervals();
        this.maxIntervals = pingIntervalsConfiguration.maxIntervals();
        this.okInterval = baseIntervals.get(ErrorClassification.NONE);
    }

    public Duration getOkInterval() {
        return okInterval;
    }

    public Duration getUpdateTime(Duration currentDuration,
                                  ErrorClassification errorClassification,
                                  int backoffConsecutiveFailures) {

        Duration nextBackoff = calculateBackoff(errorClassification, currentDuration, backoffConsecutiveFailures + 1);
        nextBackoff = addJitter(nextBackoff);

        return nextBackoff;
    }

    private Duration calculateBackoff(ErrorClassification errorClassification,
                                      Duration currentDuration,
                                      int backoffConsecutiveFailures) {

        if (currentDuration == null) {
            return baseIntervals.get(errorClassification);
        }

        Duration baseInterval = baseIntervals.get(errorClassification);
        Duration maxInterval = maxIntervals.get(errorClassification);

        if (currentDuration.compareTo(maxInterval) >= 0) {
            return maxInterval;
        }

        double multiplier = switch(errorClassification) {
            case ErrorClassification.UNKNOWN -> 1.5;
            case ErrorClassification.TIMEOUT -> 2.5;
            case ErrorClassification.CONNECTION_ERROR -> 2.0;
            case ErrorClassification.HTTP_CLIENT_ERROR -> 1.7;
            case ErrorClassification.HTTP_SERVER_ERROR -> 2.0;
            case ErrorClassification.SSL_ERROR -> 1.8;
            case ErrorClassification.DNS_ERROR -> 1.5;
            default -> 2.0; // Default multiplier for any other classification
        };

        double backoffMinutes = baseInterval.toMinutes()
                * Math.pow(multiplier, backoffConsecutiveFailures - 1);

        Duration newDuration = Duration.ofMinutes(Math.round(0.5 + backoffMinutes));
        if (newDuration.compareTo(maxInterval) > 0) {
            return maxInterval;
        }

        return newDuration;
    }

    private Duration addJitter(Duration duration) {
        // Add ±15% jitter to prevent synchronized retries
        double jitterPercent = 0.15;
        long baseMinutes = duration.toMinutes();
        long jitterRange = (long) (baseMinutes * jitterPercent * 2);
        long jitterOffset = ThreadLocalRandom.current().nextLong(jitterRange + 1) - (jitterRange / 2);

        long finalMinutes = Math.max(1, baseMinutes + jitterOffset);
        return Duration.ofMinutes(finalMinutes);
    }
}
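To make the shape of that schedule concrete, here is a small standalone sketch that replays the same multiplier arithmetic for a chain of consecutive failures. The base and max intervals are assumptions (PingIntervalsConfiguration is not shown in this diff), and the ±15% jitter step is omitted for clarity.

```java
// Illustrative only: replays BackoffStrategy's multiplier arithmetic with assumed intervals.
import java.time.Duration;

public class BackoffGrowthDemo {
    public static void main(String[] args) {
        Duration base = Duration.ofMinutes(15);  // assumed base interval
        Duration max  = Duration.ofHours(24);    // assumed cap
        double multiplier = 2.0;                 // matches the CONNECTION_ERROR multiplier above

        for (int failures = 1; failures <= 8; failures++) {
            double minutes = base.toMinutes() * Math.pow(multiplier, failures - 1);
            Duration next = Duration.ofMinutes(Math.round(0.5 + minutes));
            if (next.compareTo(max) > 0) next = max;  // clamp to the per-classification maximum
            System.out.printf("failure %d -> next check in %s%n", failures, next);
        }
    }
}
```

With these assumed values the interval doubles each time (15 min, 30 min, 1 h, 2 h, ...) until it hits the 24-hour cap, which is the behaviour the per-classification multipliers and max intervals above are designed to produce.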
Some files were not shown because too many files have changed in this diff.