mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
226 Commits
deploy-020
...
deploy-031
Author | SHA1 | Date | |
---|---|---|---|
|
8503030f18 | ||
|
744f7d3ef7 | ||
|
215e12afe9 | ||
|
2716bce918 | ||
|
caf2e6fbb7 | ||
|
233f0acfb1 | ||
|
e3a4ff02e9 | ||
|
c786283ae1 | ||
|
a3f65ac0e0 | ||
|
aba1a32af0 | ||
|
c9c442345b | ||
|
2e126ba30e | ||
|
2087985f49 | ||
|
2b13ebd18b | ||
|
6d92c125fe | ||
|
f638cfa39a | ||
|
89447c12af | ||
|
c71fc46f04 | ||
|
f96874d828 | ||
|
583a84d5a0 | ||
|
f65b946448 | ||
|
3682815855 | ||
|
3a94357660 | ||
|
673b0d3de1 | ||
|
ea942bc664 | ||
|
7ed5083c54 | ||
|
08bb2c097b | ||
|
495fb325be | ||
|
05c25bbaec | ||
|
2a028b84f3 | ||
|
a091a23623 | ||
|
e8897acb45 | ||
|
b89ffcf2be | ||
|
dbcc9055b0 | ||
|
d9740557f4 | ||
|
0d6cd015fd | ||
|
c6034efcc8 | ||
|
76068014ad | ||
|
1c3ed67127 | ||
|
fc0cb6bd9a | ||
|
c2601bac78 | ||
|
f5641b72e9 | ||
|
36efe2e219 | ||
|
983fe3829e | ||
|
668c87aa86 | ||
|
9d3f9adb05 | ||
|
a43a1773f1 | ||
|
1e7a3a3c4f | ||
|
62b696b1c3 | ||
|
f1a900f383 | ||
|
700364b86d | ||
|
7e725ddaed | ||
|
120209e138 | ||
|
a771a5b6ce | ||
|
dac5b54128 | ||
|
6cfb143c15 | ||
|
23c818281b | ||
|
8aad253cf6 | ||
|
556d7af9dc | ||
|
b7a5219ed3 | ||
|
a23ec521fe | ||
|
fff3babc6d | ||
|
b2bfb8217c | ||
|
3b2ac414dc | ||
|
0ba6515a01 | ||
|
16c6b0f151 | ||
|
e998692900 | ||
|
eeb1695a87 | ||
|
a0ab910940 | ||
|
b9f31048d7 | ||
|
12c304289a | ||
|
6ee01dabea | ||
|
1b80e282a7 | ||
|
a65d18f1d1 | ||
|
90a1ff220b | ||
|
d6c7092335 | ||
|
b716333856 | ||
|
b504b8482c | ||
|
80da1e9ad1 | ||
|
d3f744a441 | ||
|
60fb539875 | ||
|
7f5094fedf | ||
|
45066636a5 | ||
|
e2d6898c51 | ||
|
58ef767b94 | ||
|
f9f268c67a | ||
|
f44c2bdee9 | ||
|
6fdf477c18 | ||
|
6b6e455e3f | ||
|
a3a126540c | ||
|
842b19da40 | ||
|
2a30e93bf0 | ||
|
3d998f12c0 | ||
|
cbccc2ac23 | ||
|
2cfc23f9b7 | ||
|
88fe394cdb | ||
|
f30fcebd4f | ||
|
5d885927b4 | ||
|
7622c8358e | ||
|
69ed9aef47 | ||
|
4c78c223da | ||
|
71b9935dd6 | ||
|
ad38f2fd83 | ||
|
9c47388846 | ||
|
d9ab10e33f | ||
|
e13ea7f42b | ||
|
f38daeb036 | ||
|
6e214293e5 | ||
|
52582a6d7d | ||
|
ec0e39ad32 | ||
|
6a15aee4b0 | ||
|
bd5111e8a2 | ||
|
1ecbeb0272 | ||
|
b91354925d | ||
|
3f85c9c154 | ||
|
390f053406 | ||
|
89e03d6914 | ||
|
14e0bc9f26 | ||
|
7065b46c6f | ||
|
0372190c90 | ||
|
ceaf32fb90 | ||
|
b03c43224c | ||
|
9b4ce9e9eb | ||
|
81ac02a695 | ||
|
47f624fb3b | ||
|
b57db01415 | ||
|
ce7d522608 | ||
|
18649b6ee9 | ||
|
f6417aef1a | ||
|
2aa7e376b0 | ||
|
f33bc44860 | ||
|
a2826efd44 | ||
|
c866f19cbb | ||
|
518278493b | ||
|
1ac0bab0b8 | ||
|
08b45ed10a | ||
|
f2cfb91973 | ||
|
2f79524eb3 | ||
|
3b00142c96 | ||
|
294ab19177 | ||
|
6f1659ecb2 | ||
|
982dcb28f0 | ||
|
fc686d8b2e | ||
|
69ef0f334a | ||
|
446746f3bd | ||
|
24ab8398bb | ||
|
d2ceeff4cf | ||
|
cf64214b1c | ||
|
e50d09cc01 | ||
|
bce3892ce0 | ||
|
36581b25c2 | ||
|
52ff7fb4dd | ||
|
a4e49e658a | ||
|
e2c56dc3ca | ||
|
470b866008 | ||
|
4895a2ac7a | ||
|
fd32ae9fa7 | ||
|
470651ea4c | ||
|
8d4829e783 | ||
|
1290bc15dc | ||
|
e7fa558954 | ||
|
720685bf3f | ||
|
cbec63c7da | ||
|
b03ca75785 | ||
|
184aedc071 | ||
|
0275bad281 | ||
|
fd83a9d0b8 | ||
|
d556f8ae3a | ||
|
e37559837b | ||
|
3564c4aaee | ||
|
92c54563ab | ||
|
d7a5d90b07 | ||
|
0a0e88fd6e | ||
|
b4fc0c4368 | ||
|
87ee8765b8 | ||
|
1adf4835fa | ||
|
b7b5d0bf46 | ||
|
416059adde | ||
|
db7930016a | ||
|
82456ad673 | ||
|
0882a6d9cd | ||
|
5020029c2d | ||
|
ac44d0b093 | ||
|
4b32b9b10e | ||
|
9f041d6631 | ||
|
13fb1efce4 | ||
|
c1225165b7 | ||
|
67ad7a3bbc | ||
|
ed62ec8a35 | ||
|
42b24cfa34 | ||
|
1ffaab2da6 | ||
|
5f93c7f767 | ||
|
4001c68c82 | ||
|
6b811489c5 | ||
|
e9d317c65d | ||
|
16b05a4737 | ||
|
021cd73cbb | ||
|
4253bd53b5 | ||
|
14c87461a5 | ||
|
9afed0a18e | ||
|
afad4deb94 | ||
|
f071c947e4 | ||
|
79996c9348 | ||
|
db907ab06a | ||
|
c49cd9dd95 | ||
|
eec9df3b0a | ||
|
e5f3288de6 | ||
|
d587544d3a | ||
|
1a9ae1bc40 | ||
|
e0c81e956a | ||
|
542fb12b38 | ||
|
65ec734566 | ||
|
10b6a25c63 | ||
|
6260f6bec7 | ||
|
d6d5467696 | ||
|
034560ca75 | ||
|
e994fddae4 | ||
|
345f01f306 | ||
|
5a8e286689 | ||
|
39a055aa94 | ||
|
37aaa90dc9 | ||
|
24022c5adc | ||
|
1de9ecc0b6 | ||
|
9b80245ea0 | ||
|
4e1595c1a6 | ||
|
0be8585fa5 |
24
ROADMAP.md
24
ROADMAP.md
@@ -38,14 +38,6 @@ associated with each language added, at least a models file or two, as well as s
|
|||||||
|
|
||||||
It would be very helpful to find a speaker of a large language other than English to help in the fine tuning.
|
It would be very helpful to find a speaker of a large language other than English to help in the fine tuning.
|
||||||
|
|
||||||
## Support for binary formats like PDF
|
|
||||||
|
|
||||||
The crawler needs to be modified to retain them, and the conversion logic needs to parse them.
|
|
||||||
The documents database probably should have some sort of flag indicating it's a PDF as well.
|
|
||||||
|
|
||||||
PDF parsing is known to be a bit of a security liability so some thought needs to be put in
|
|
||||||
that direction as well.
|
|
||||||
|
|
||||||
## Custom ranking logic
|
## Custom ranking logic
|
||||||
|
|
||||||
Stract does an interesting thing where they have configurable search filters.
|
Stract does an interesting thing where they have configurable search filters.
|
||||||
@@ -56,16 +48,24 @@ filter for any API consumer.
|
|||||||
|
|
||||||
I've talked to the stract dev and he does not think it's a good idea to mimic their optics language, which is quite ad-hoc, but instead to work together to find some new common description language for this.
|
I've talked to the stract dev and he does not think it's a good idea to mimic their optics language, which is quite ad-hoc, but instead to work together to find some new common description language for this.
|
||||||
|
|
||||||
## Show favicons next to search results
|
|
||||||
|
|
||||||
This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.
|
|
||||||
|
|
||||||
## Specialized crawler for github
|
## Specialized crawler for github
|
||||||
|
|
||||||
One of the search engine's biggest limitations right now is that it does not index github at all. A specialized crawler that fetches at least the readme.md would go a long way toward providing search capabilities in this domain.
|
One of the search engine's biggest limitations right now is that it does not index github at all. A specialized crawler that fetches at least the readme.md would go a long way toward providing search capabilities in this domain.
|
||||||
|
|
||||||
# Completed
|
# Completed
|
||||||
|
|
||||||
|
## Support for binary formats like PDF (COMPLETED 2025-05)
|
||||||
|
|
||||||
|
The crawler needs to be modified to retain them, and the conversion logic needs to parse them.
|
||||||
|
The documents database probably should have some sort of flag indicating it's a PDF as well.
|
||||||
|
|
||||||
|
PDF parsing is known to be a bit of a security liability so some thought needs to be put in
|
||||||
|
that direction as well.
|
||||||
|
|
||||||
|
## Show favicons next to search results (COMPLETED 2025-03)
|
||||||
|
|
||||||
|
This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.
|
||||||
|
|
||||||
## Web Design Overhaul (COMPLETED 2025-01)
|
## Web Design Overhaul (COMPLETED 2025-01)
|
||||||
|
|
||||||
The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
|
The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
|
||||||
|
@@ -1,3 +1,8 @@
|
|||||||
package nu.marginalia;
|
package nu.marginalia;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A record representing a User Agent.
|
||||||
|
* @param uaString - the header value of the User Agent
|
||||||
|
* @param uaIdentifier - what we look for in robots.txt
|
||||||
|
*/
|
||||||
public record UserAgent(String uaString, String uaIdentifier) {}
|
public record UserAgent(String uaString, String uaIdentifier) {}
|
||||||
|
@@ -45,7 +45,7 @@ public class NodeConfigurationService {
|
|||||||
public List<NodeConfiguration> getAll() {
|
public List<NodeConfiguration> getAll() {
|
||||||
try (var conn = dataSource.getConnection();
|
try (var conn = dataSource.getConnection();
|
||||||
var qs = conn.prepareStatement("""
|
var qs = conn.prepareStatement("""
|
||||||
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, NODE_PROFILE, DISABLED
|
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, AUTO_ASSIGN_DOMAINS, KEEP_WARCS, NODE_PROFILE, DISABLED
|
||||||
FROM NODE_CONFIGURATION
|
FROM NODE_CONFIGURATION
|
||||||
""")) {
|
""")) {
|
||||||
var rs = qs.executeQuery();
|
var rs = qs.executeQuery();
|
||||||
@@ -59,6 +59,7 @@ public class NodeConfigurationService {
|
|||||||
rs.getBoolean("ACCEPT_QUERIES"),
|
rs.getBoolean("ACCEPT_QUERIES"),
|
||||||
rs.getBoolean("AUTO_CLEAN"),
|
rs.getBoolean("AUTO_CLEAN"),
|
||||||
rs.getBoolean("PRECESSION"),
|
rs.getBoolean("PRECESSION"),
|
||||||
|
rs.getBoolean("AUTO_ASSIGN_DOMAINS"),
|
||||||
rs.getBoolean("KEEP_WARCS"),
|
rs.getBoolean("KEEP_WARCS"),
|
||||||
NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
|
NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
|
||||||
rs.getBoolean("DISABLED")
|
rs.getBoolean("DISABLED")
|
||||||
@@ -75,7 +76,7 @@ public class NodeConfigurationService {
|
|||||||
public NodeConfiguration get(int nodeId) throws SQLException {
|
public NodeConfiguration get(int nodeId) throws SQLException {
|
||||||
try (var conn = dataSource.getConnection();
|
try (var conn = dataSource.getConnection();
|
||||||
var qs = conn.prepareStatement("""
|
var qs = conn.prepareStatement("""
|
||||||
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, NODE_PROFILE, DISABLED
|
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, AUTO_ASSIGN_DOMAINS, KEEP_WARCS, NODE_PROFILE, DISABLED
|
||||||
FROM NODE_CONFIGURATION
|
FROM NODE_CONFIGURATION
|
||||||
WHERE ID=?
|
WHERE ID=?
|
||||||
""")) {
|
""")) {
|
||||||
@@ -88,6 +89,7 @@ public class NodeConfigurationService {
|
|||||||
rs.getBoolean("ACCEPT_QUERIES"),
|
rs.getBoolean("ACCEPT_QUERIES"),
|
||||||
rs.getBoolean("AUTO_CLEAN"),
|
rs.getBoolean("AUTO_CLEAN"),
|
||||||
rs.getBoolean("PRECESSION"),
|
rs.getBoolean("PRECESSION"),
|
||||||
|
rs.getBoolean("AUTO_ASSIGN_DOMAINS"),
|
||||||
rs.getBoolean("KEEP_WARCS"),
|
rs.getBoolean("KEEP_WARCS"),
|
||||||
NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
|
NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
|
||||||
rs.getBoolean("DISABLED")
|
rs.getBoolean("DISABLED")
|
||||||
@@ -102,7 +104,7 @@ public class NodeConfigurationService {
|
|||||||
try (var conn = dataSource.getConnection();
|
try (var conn = dataSource.getConnection();
|
||||||
var us = conn.prepareStatement("""
|
var us = conn.prepareStatement("""
|
||||||
UPDATE NODE_CONFIGURATION
|
UPDATE NODE_CONFIGURATION
|
||||||
SET DESCRIPTION=?, ACCEPT_QUERIES=?, AUTO_CLEAN=?, PRECESSION=?, KEEP_WARCS=?, DISABLED=?, NODE_PROFILE=?
|
SET DESCRIPTION=?, ACCEPT_QUERIES=?, AUTO_CLEAN=?, PRECESSION=?, AUTO_ASSIGN_DOMAINS=?, KEEP_WARCS=?, DISABLED=?, NODE_PROFILE=?
|
||||||
WHERE ID=?
|
WHERE ID=?
|
||||||
"""))
|
"""))
|
||||||
{
|
{
|
||||||
@@ -110,10 +112,11 @@ public class NodeConfigurationService {
|
|||||||
us.setBoolean(2, config.acceptQueries());
|
us.setBoolean(2, config.acceptQueries());
|
||||||
us.setBoolean(3, config.autoClean());
|
us.setBoolean(3, config.autoClean());
|
||||||
us.setBoolean(4, config.includeInPrecession());
|
us.setBoolean(4, config.includeInPrecession());
|
||||||
us.setBoolean(5, config.keepWarcs());
|
us.setBoolean(5, config.autoAssignDomains());
|
||||||
us.setBoolean(6, config.disabled());
|
us.setBoolean(6, config.keepWarcs());
|
||||||
us.setString(7, config.profile().name());
|
us.setBoolean(7, config.disabled());
|
||||||
us.setInt(8, config.node());
|
us.setString(8, config.profile().name());
|
||||||
|
us.setInt(9, config.node());
|
||||||
|
|
||||||
if (us.executeUpdate() <= 0)
|
if (us.executeUpdate() <= 0)
|
||||||
throw new IllegalStateException("Failed to update configuration");
|
throw new IllegalStateException("Failed to update configuration");
|
||||||
|
@@ -5,6 +5,7 @@ public record NodeConfiguration(int node,
|
|||||||
boolean acceptQueries,
|
boolean acceptQueries,
|
||||||
boolean autoClean,
|
boolean autoClean,
|
||||||
boolean includeInPrecession,
|
boolean includeInPrecession,
|
||||||
|
boolean autoAssignDomains,
|
||||||
boolean keepWarcs,
|
boolean keepWarcs,
|
||||||
NodeProfile profile,
|
NodeProfile profile,
|
||||||
boolean disabled
|
boolean disabled
|
||||||
|
@@ -20,9 +20,7 @@ public enum NodeProfile {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public boolean permitBatchCrawl() {
|
public boolean permitBatchCrawl() {
|
||||||
return isBatchCrawl() ||isMixed();
|
return isBatchCrawl() || isMixed();
|
||||||
}
|
|
||||||
public boolean permitSideload() {
|
|
||||||
return isMixed() || isSideload();
|
|
||||||
}
|
}
|
||||||
|
public boolean permitSideload() { return isSideload() || isMixed(); }
|
||||||
}
|
}
|
||||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.nodecfg;
|
|||||||
|
|
||||||
import com.zaxxer.hikari.HikariConfig;
|
import com.zaxxer.hikari.HikariConfig;
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import nu.marginalia.nodecfg.model.NodeConfiguration;
|
||||||
import nu.marginalia.nodecfg.model.NodeProfile;
|
import nu.marginalia.nodecfg.model.NodeProfile;
|
||||||
import nu.marginalia.test.TestMigrationLoader;
|
import nu.marginalia.test.TestMigrationLoader;
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
@@ -62,6 +63,63 @@ public class NodeConfigurationServiceTest {
|
|||||||
assertEquals(2, list.size());
|
assertEquals(2, list.size());
|
||||||
assertEquals(a, list.get(0));
|
assertEquals(a, list.get(0));
|
||||||
assertEquals(b, list.get(1));
|
assertEquals(b, list.get(1));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Test all the fields that are only exposed via save()
|
||||||
|
@Test
|
||||||
|
public void testSaveChanges() throws SQLException {
|
||||||
|
var original = nodeConfigurationService.create(1, "Test", false, false, NodeProfile.MIXED);
|
||||||
|
|
||||||
|
assertEquals(1, original.node());
|
||||||
|
assertEquals("Test", original.description());
|
||||||
|
assertFalse(original.acceptQueries());
|
||||||
|
|
||||||
|
var precession = new NodeConfiguration(
|
||||||
|
original.node(),
|
||||||
|
"Foo",
|
||||||
|
true,
|
||||||
|
original.autoClean(),
|
||||||
|
original.includeInPrecession(),
|
||||||
|
!original.autoAssignDomains(),
|
||||||
|
original.keepWarcs(),
|
||||||
|
original.profile(),
|
||||||
|
original.disabled()
|
||||||
|
);
|
||||||
|
|
||||||
|
nodeConfigurationService.save(precession);
|
||||||
|
precession = nodeConfigurationService.get(original.node());
|
||||||
|
assertNotEquals(original.autoAssignDomains(), precession.autoAssignDomains());
|
||||||
|
|
||||||
|
var autoClean = new NodeConfiguration(
|
||||||
|
original.node(),
|
||||||
|
"Foo",
|
||||||
|
true,
|
||||||
|
!original.autoClean(),
|
||||||
|
original.includeInPrecession(),
|
||||||
|
original.autoAssignDomains(),
|
||||||
|
original.keepWarcs(),
|
||||||
|
original.profile(),
|
||||||
|
original.disabled()
|
||||||
|
);
|
||||||
|
|
||||||
|
nodeConfigurationService.save(autoClean);
|
||||||
|
autoClean = nodeConfigurationService.get(original.node());
|
||||||
|
assertNotEquals(original.autoClean(), autoClean.autoClean());
|
||||||
|
|
||||||
|
var disabled = new NodeConfiguration(
|
||||||
|
original.node(),
|
||||||
|
"Foo",
|
||||||
|
true,
|
||||||
|
autoClean.autoClean(),
|
||||||
|
autoClean.includeInPrecession(),
|
||||||
|
autoClean.autoAssignDomains(),
|
||||||
|
autoClean.keepWarcs(),
|
||||||
|
autoClean.profile(),
|
||||||
|
!autoClean.disabled()
|
||||||
|
);
|
||||||
|
nodeConfigurationService.save(disabled);
|
||||||
|
disabled = nodeConfigurationService.get(original.node());
|
||||||
|
assertNotEquals(autoClean.disabled(), disabled.disabled());
|
||||||
}
|
}
|
||||||
}
|
}
|
@@ -0,0 +1,5 @@
|
|||||||
|
CREATE TABLE IF NOT EXISTS WMSA_prod.NSFW_DOMAINS (
|
||||||
|
ID INT NOT NULL AUTO_INCREMENT,
|
||||||
|
TIER INT NOT NULL,
|
||||||
|
PRIMARY KEY (ID)
|
||||||
|
);
|
@@ -0,0 +1,213 @@
|
|||||||
|
|
||||||
|
-- Create metadata tables for domain ping status and security information
|
||||||
|
|
||||||
|
-- These are not ICMP pings, but rather HTTP(S) pings to check the availability and security
|
||||||
|
-- of web servers associated with domains, to assess uptime and changes in security configurations
|
||||||
|
-- indicating ownership changes or security issues.
|
||||||
|
|
||||||
|
-- Note: DOMAIN_ID and NODE_ID are used to identify the domain and the node that performed the ping.
|
||||||
|
-- These are strictly speaking foreign keys to the EC_DOMAIN table, but as it
|
||||||
|
-- is strictly append-only, we do not need to enforce foreign key constraints.
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION (
|
||||||
|
DOMAIN_ID INT NOT NULL PRIMARY KEY,
|
||||||
|
NODE_ID INT NOT NULL,
|
||||||
|
|
||||||
|
SERVER_AVAILABLE BOOLEAN NOT NULL, -- Indicates if the server is available (true) or not (false)
|
||||||
|
SERVER_IP VARBINARY(16), -- IP address of the server (IPv4 or IPv6)
|
||||||
|
SERVER_IP_ASN INTEGER, -- Autonomous System number
|
||||||
|
|
||||||
|
DATA_HASH BIGINT, -- Hash of the data for integrity checks
|
||||||
|
SECURITY_CONFIG_HASH BIGINT, -- Hash of the security configuration for integrity checks
|
||||||
|
|
||||||
|
HTTP_SCHEMA ENUM('HTTP', 'HTTPS'), -- HTTP or HTTPS protocol used
|
||||||
|
HTTP_ETAG VARCHAR(255), -- ETag of the resource as per HTTP headers
|
||||||
|
HTTP_LAST_MODIFIED VARCHAR(255), -- Last modified date of the resource as per HTTP headers
|
||||||
|
HTTP_STATUS INT, -- HTTP status code (e.g., 200, 404, etc.)
|
||||||
|
HTTP_LOCATION VARCHAR(255), -- If the server redirects, this is the location of the redirect
|
||||||
|
HTTP_RESPONSE_TIME_MS SMALLINT UNSIGNED, -- Response time in milliseconds
|
||||||
|
|
||||||
|
ERROR_CLASSIFICATION ENUM('NONE', 'TIMEOUT', 'SSL_ERROR', 'DNS_ERROR', 'CONNECTION_ERROR', 'HTTP_CLIENT_ERROR', 'HTTP_SERVER_ERROR', 'UNKNOWN'), -- Classification of the error if the server is not available
|
||||||
|
ERROR_MESSAGE VARCHAR(255), -- Error message if the server is not available
|
||||||
|
|
||||||
|
TS_LAST_PING TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, -- Timestamp of the last ping
|
||||||
|
TS_LAST_AVAILABLE TIMESTAMP, -- Timestamp of the last time the server was available
|
||||||
|
TS_LAST_ERROR TIMESTAMP, -- Timestamp of the last error encountered
|
||||||
|
|
||||||
|
NEXT_SCHEDULED_UPDATE TIMESTAMP NOT NULL,
|
||||||
|
BACKOFF_CONSECUTIVE_FAILURES INT NOT NULL DEFAULT 0, -- Number of consecutive failures to ping the server
|
||||||
|
BACKOFF_FETCH_INTERVAL INT NOT NULL DEFAULT 60 -- Interval in seconds for the next scheduled ping
|
||||||
|
) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_AVAILABILITY_INFORMATION (NODE_ID, DOMAIN_ID);
|
||||||
|
CREATE INDEX IF NOT EXISTS DOMAIN_AVAILABILITY_INFORMATION__NEXT_SCHEDULED_UPDATE_IDX ON DOMAIN_AVAILABILITY_INFORMATION (NODE_ID, NEXT_SCHEDULED_UPDATE);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS DOMAIN_SECURITY_INFORMATION (
|
||||||
|
DOMAIN_ID INT NOT NULL PRIMARY KEY,
|
||||||
|
NODE_ID INT NOT NULL,
|
||||||
|
|
||||||
|
ASN INTEGER, -- Autonomous System Number (ASN) of the server
|
||||||
|
HTTP_SCHEMA ENUM('HTTP', 'HTTPS'), -- HTTP or HTTPS protocol used
|
||||||
|
HTTP_VERSION VARCHAR(10), -- HTTP version used (e.g., HTTP/1.1, HTTP/2)
|
||||||
|
HTTP_COMPRESSION VARCHAR(50), -- Compression method used (e.g., gzip, deflate, br)
|
||||||
|
HTTP_CACHE_CONTROL TEXT, -- Cache control directives from HTTP headers
|
||||||
|
|
||||||
|
SSL_CERT_NOT_BEFORE TIMESTAMP, -- Valid from date (usually same as issued)
|
||||||
|
SSL_CERT_NOT_AFTER TIMESTAMP, -- Valid until date (usually same as expires)
|
||||||
|
|
||||||
|
SSL_CERT_ISSUER VARCHAR(255), -- CA that issued the cert
|
||||||
|
SSL_CERT_SUBJECT VARCHAR(255), -- Certificate subject/CN
|
||||||
|
|
||||||
|
SSL_CERT_PUBLIC_KEY_HASH BINARY(32), -- SHA-256 hash of the public key
|
||||||
|
SSL_CERT_SERIAL_NUMBER VARCHAR(100), -- Unique cert serial number
|
||||||
|
SSL_CERT_FINGERPRINT_SHA256 BINARY(32), -- SHA-256 fingerprint for exact identification
|
||||||
|
SSL_CERT_SAN TEXT, -- Subject Alternative Names (JSON array)
|
||||||
|
SSL_CERT_WILDCARD BOOLEAN, -- Wildcard certificate (*.example.com)
|
||||||
|
|
||||||
|
SSL_PROTOCOL VARCHAR(20), -- TLS 1.2, TLS 1.3, etc.
|
||||||
|
SSL_CIPHER_SUITE VARCHAR(100), -- e.g., TLS_AES_256_GCM_SHA384
|
||||||
|
SSL_KEY_EXCHANGE VARCHAR(50), -- ECDHE, RSA, etc.
|
||||||
|
SSL_CERTIFICATE_CHAIN_LENGTH TINYINT, -- Number of certs in chain
|
||||||
|
|
||||||
|
SSL_CERTIFICATE_VALID BOOLEAN, -- Valid cert chain
|
||||||
|
|
||||||
|
HEADER_CORS_ALLOW_ORIGIN TEXT, -- Could be *, specific domains, or null
|
||||||
|
HEADER_CORS_ALLOW_CREDENTIALS BOOLEAN, -- Credential handling
|
||||||
|
HEADER_CONTENT_SECURITY_POLICY_HASH INT, -- CSP header, hash of the policy
|
||||||
|
HEADER_STRICT_TRANSPORT_SECURITY VARCHAR(255), -- HSTS header
|
||||||
|
HEADER_REFERRER_POLICY VARCHAR(50), -- Referrer handling
|
||||||
|
HEADER_X_FRAME_OPTIONS VARCHAR(50), -- Clickjacking protection
|
||||||
|
HEADER_X_CONTENT_TYPE_OPTIONS VARCHAR(50), -- MIME sniffing protection
|
||||||
|
HEADER_X_XSS_PROTECTION VARCHAR(50), -- XSS protection header
|
||||||
|
|
||||||
|
HEADER_SERVER VARCHAR(255), -- Server header (e.g., Apache, Nginx, etc.)
|
||||||
|
HEADER_X_POWERED_BY VARCHAR(255), -- X-Powered-By header (if present)
|
||||||
|
|
||||||
|
TS_LAST_UPDATE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP -- Timestamp of the last SSL check
|
||||||
|
) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
|
||||||
|
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_INFORMATION__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_SECURITY_INFORMATION (NODE_ID, DOMAIN_ID);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS DOMAIN_SECURITY_EVENTS (
|
||||||
|
CHANGE_ID BIGINT AUTO_INCREMENT PRIMARY KEY, -- Unique identifier for the change
|
||||||
|
DOMAIN_ID INT NOT NULL, -- Domain ID, used as a foreign key to EC_DOMAIN
|
||||||
|
NODE_ID INT NOT NULL,
|
||||||
|
|
||||||
|
TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, -- Timestamp of the change
|
||||||
|
|
||||||
|
CHANGE_ASN BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to ASN (Autonomous System Number)
|
||||||
|
CHANGE_CERTIFICATE_FINGERPRINT BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to SSL certificate fingerprint
|
||||||
|
CHANGE_CERTIFICATE_PROFILE BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to SSL certificate profile (e.g., algorithm, exchange)
|
||||||
|
CHANGE_CERTIFICATE_SAN BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to SSL certificate SAN (Subject Alternative Name)
|
||||||
|
CHANGE_CERTIFICATE_PUBLIC_KEY BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to SSL certificate public key
|
||||||
|
CHANGE_SECURITY_HEADERS BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to security headers
|
||||||
|
CHANGE_IP_ADDRESS BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to IP address
|
||||||
|
CHANGE_SOFTWARE BOOLEAN NOT NULL DEFAULT FALSE, -- Indicates if the change is related to the generator (e.g., web server software)
|
||||||
|
OLD_CERT_TIME_TO_EXPIRY INT, -- Time to expiry of the old certificate in hours, if applicable
|
||||||
|
|
||||||
|
SECURITY_SIGNATURE_BEFORE BLOB NOT NULL, -- Security signature before the change, gzipped json record
|
||||||
|
SECURITY_SIGNATURE_AFTER BLOB NOT NULL -- Security signature after the change, gzipped json record
|
||||||
|
) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_EVENTS__NODE_ID__DOMAIN_ID_IDX ON DOMAIN_SECURITY_EVENTS (NODE_ID, DOMAIN_ID);
|
||||||
|
CREATE INDEX IF NOT EXISTS DOMAIN_SECURITY_EVENTS__TS_CHANGE_IDX ON DOMAIN_SECURITY_EVENTS (TS_CHANGE);
|
||||||
|
|
||||||
|
-- Availability events per domain, range-partitioned on the month of TS_CHANGE.
-- P_KEY_MONTH defaults to MONTH(TS_CHANGE), which yields 1 (January) .. 12 (December),
-- and has to be part of the primary key because it is the partitioning column.
CREATE TABLE IF NOT EXISTS DOMAIN_AVAILABILITY_EVENTS (
    DOMAIN_ID INT NOT NULL,
    NODE_ID INT NOT NULL,

    AVAILABLE BOOLEAN NOT NULL, -- True if the service is available, false if it is not
    OUTAGE_TYPE ENUM('NONE', 'TIMEOUT', 'SSL_ERROR', 'DNS_ERROR', 'CONNECTION_ERROR', 'HTTP_CLIENT_ERROR', 'HTTP_SERVER_ERROR', 'UNKNOWN') NOT NULL,
    HTTP_STATUS_CODE INT, -- HTTP status code if available (e.g., 200, 404, etc.)
    ERROR_MESSAGE VARCHAR(255), -- Specific error details

    TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, -- Timestamp of the last update

    AVAILABILITY_RECORD_ID BIGINT AUTO_INCREMENT,
    P_KEY_MONTH TINYINT NOT NULL DEFAULT MONTH(TS_CHANGE), -- Month of the change for partitioning
    PRIMARY KEY (AVAILABILITY_RECORD_ID, P_KEY_MONTH)
)
CHARACTER SET utf8mb4 COLLATE utf8mb4_bin
PARTITION BY RANGE (P_KEY_MONTH) (
    -- VALUES LESS THAN is an exclusive upper bound, so month m needs the bound m + 1.
    -- The last bound must be 13; with 12 every December insert is rejected
    -- because no partition accepts the value 12.
    PARTITION p0 VALUES LESS THAN (2), -- January
    PARTITION p1 VALUES LESS THAN (3), -- February
    PARTITION p2 VALUES LESS THAN (4), -- March
    PARTITION p3 VALUES LESS THAN (5), -- April
    PARTITION p4 VALUES LESS THAN (6), -- May
    PARTITION p5 VALUES LESS THAN (7), -- June
    PARTITION p6 VALUES LESS THAN (8), -- July
    PARTITION p7 VALUES LESS THAN (9), -- August
    PARTITION p8 VALUES LESS THAN (10), -- September
    PARTITION p9 VALUES LESS THAN (11), -- October
    PARTITION p10 VALUES LESS THAN (12), -- November
    PARTITION p11 VALUES LESS THAN (13) -- December
);
|
||||||
|
|
||||||
|
-- IF NOT EXISTS keeps these in line with the IF NOT EXISTS on the table itself:
-- a re-run that skips the table must not fail on the already-present indexes.
CREATE INDEX IF NOT EXISTS DOMAIN_AVAILABILITY_EVENTS__DOMAIN_ID_TS_IDX ON DOMAIN_AVAILABILITY_EVENTS (DOMAIN_ID, TS_CHANGE);
CREATE INDEX IF NOT EXISTS DOMAIN_AVAILABILITY_EVENTS__TS_CHANGE_IDX ON DOMAIN_AVAILABILITY_EVENTS (TS_CHANGE);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS DOMAIN_DNS_INFORMATION (
|
||||||
|
DNS_ROOT_DOMAIN_ID INT AUTO_INCREMENT PRIMARY KEY,
|
||||||
|
ROOT_DOMAIN_NAME VARCHAR(255) NOT NULL UNIQUE,
|
||||||
|
NODE_AFFINITY INT NOT NULL, -- Node ID that performs the DNS check, assign randomly across nodes
|
||||||
|
|
||||||
|
DNS_A_RECORDS TEXT, -- JSON array of IPv4 addresses
|
||||||
|
DNS_AAAA_RECORDS TEXT, -- JSON array of IPv6 addresses
|
||||||
|
DNS_CNAME_RECORD VARCHAR(255), -- Canonical name (if applicable)
|
||||||
|
DNS_MX_RECORDS TEXT, -- JSON array of mail exchange records
|
||||||
|
DNS_CAA_RECORDS TEXT, -- Certificate Authority Authorization
|
||||||
|
DNS_TXT_RECORDS TEXT, -- TXT records (SPF, DKIM, verification, etc.)
|
||||||
|
DNS_NS_RECORDS TEXT, -- Name servers (JSON array)
|
||||||
|
DNS_SOA_RECORD TEXT, -- Start of Authority (JSON object)
|
||||||
|
|
||||||
|
TS_LAST_DNS_CHECK TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
|
||||||
|
TS_NEXT_DNS_CHECK TIMESTAMP NOT NULL,
|
||||||
|
DNS_CHECK_PRIORITY TINYINT DEFAULT 0 -- Priority of the DNS check, in case we want to schedule a refresh sooner
|
||||||
|
) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- IF NOT EXISTS matches the IF NOT EXISTS on DOMAIN_DNS_INFORMATION, so a re-run
-- that skips the table does not fail on the already-present index.
CREATE INDEX IF NOT EXISTS DOMAIN_DNS_INFORMATION__PRIORITY_NEXT_CHECK_IDX ON DOMAIN_DNS_INFORMATION (NODE_AFFINITY, DNS_CHECK_PRIORITY DESC, TS_NEXT_DNS_CHECK);
|
||||||
|
|
||||||
|
-- DNS change events per root domain, range-partitioned on the month of TS_CHANGE.
-- P_KEY_MONTH defaults to MONTH(TS_CHANGE), which yields 1 (January) .. 12 (December),
-- and has to be part of the primary key because it is the partitioning column.
CREATE TABLE IF NOT EXISTS DOMAIN_DNS_EVENTS (
    DNS_ROOT_DOMAIN_ID INT NOT NULL,
    NODE_ID INT NOT NULL,

    TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,

    -- DNS change type flags
    CHANGE_A_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- IPv4 address changes
    CHANGE_AAAA_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- IPv6 address changes
    CHANGE_CNAME BOOLEAN NOT NULL DEFAULT FALSE, -- CNAME changes
    CHANGE_MX_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- Mail server changes
    CHANGE_CAA_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- Certificate authority changes
    CHANGE_TXT_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- TXT record changes (SPF, DKIM, etc.)
    CHANGE_NS_RECORDS BOOLEAN NOT NULL DEFAULT FALSE, -- Name server changes (big red flag!)
    CHANGE_SOA_RECORD BOOLEAN NOT NULL DEFAULT FALSE, -- Start of Authority changes

    DNS_SIGNATURE_BEFORE BLOB NOT NULL, -- Compressed JSON snapshot of DNS records before change
    DNS_SIGNATURE_AFTER BLOB NOT NULL, -- Compressed JSON snapshot of DNS records after change

    DNS_EVENT_ID BIGINT AUTO_INCREMENT,
    P_KEY_MONTH TINYINT NOT NULL DEFAULT MONTH(TS_CHANGE), -- Month of the change for partitioning
    PRIMARY KEY (DNS_EVENT_ID, P_KEY_MONTH)
)
CHARACTER SET utf8mb4 COLLATE utf8mb4_bin
PARTITION BY RANGE (P_KEY_MONTH) (
    -- VALUES LESS THAN is an exclusive upper bound, so month m needs the bound m + 1.
    -- The last bound must be 13; with 12 every December insert is rejected
    -- because no partition accepts the value 12.
    PARTITION p0 VALUES LESS THAN (2), -- January
    PARTITION p1 VALUES LESS THAN (3), -- February
    PARTITION p2 VALUES LESS THAN (4), -- March
    PARTITION p3 VALUES LESS THAN (5), -- April
    PARTITION p4 VALUES LESS THAN (6), -- May
    PARTITION p5 VALUES LESS THAN (7), -- June
    PARTITION p6 VALUES LESS THAN (8), -- July
    PARTITION p7 VALUES LESS THAN (9), -- August
    PARTITION p8 VALUES LESS THAN (10), -- September
    PARTITION p9 VALUES LESS THAN (11), -- October
    PARTITION p10 VALUES LESS THAN (12), -- November
    PARTITION p11 VALUES LESS THAN (13) -- December
);
|
||||||
|
|
||||||
|
-- IF NOT EXISTS keeps these in line with the IF NOT EXISTS on the table itself:
-- a re-run that skips the table must not fail on the already-present indexes.
CREATE INDEX IF NOT EXISTS DOMAIN_DNS_EVENTS__DNS_ROOT_DOMAIN_ID_TS_IDX ON DOMAIN_DNS_EVENTS (DNS_ROOT_DOMAIN_ID, TS_CHANGE);
CREATE INDEX IF NOT EXISTS DOMAIN_DNS_EVENTS__TS_CHANGE_IDX ON DOMAIN_DNS_EVENTS (TS_CHANGE);
|
@@ -0,0 +1,6 @@
|
|||||||
|
-- Add additional summary columns to DOMAIN_SECURITY_EVENTS table
|
||||||
|
-- to make it easier to make sense of certificate changes
|
||||||
|
|
||||||
|
ALTER TABLE DOMAIN_SECURITY_EVENTS ADD COLUMN CHANGE_CERTIFICATE_SERIAL_NUMBER BOOLEAN NOT NULL DEFAULT FALSE;
|
||||||
|
ALTER TABLE DOMAIN_SECURITY_EVENTS ADD COLUMN CHANGE_CERTIFICATE_ISSUER BOOLEAN NOT NULL DEFAULT FALSE;
|
||||||
|
OPTIMIZE TABLE DOMAIN_SECURITY_EVENTS;
|
@@ -0,0 +1,7 @@
|
|||||||
|
-- Add additional summary columns to DOMAIN_SECURITY_INFORMATION table
|
||||||
|
-- to make it easier to get more information about the SSL certificate's validity
|
||||||
|
|
||||||
|
ALTER TABLE DOMAIN_SECURITY_INFORMATION ADD COLUMN SSL_CHAIN_VALID BOOLEAN DEFAULT NULL;
|
||||||
|
ALTER TABLE DOMAIN_SECURITY_INFORMATION ADD COLUMN SSL_HOST_VALID BOOLEAN DEFAULT NULL;
|
||||||
|
ALTER TABLE DOMAIN_SECURITY_INFORMATION ADD COLUMN SSL_DATE_VALID BOOLEAN DEFAULT NULL;
|
||||||
|
OPTIMIZE TABLE DOMAIN_SECURITY_INFORMATION;
|
@@ -0,0 +1,5 @@
|
|||||||
|
-- Add additional summary columns to DOMAIN_SECURITY_EVENTS table
|
||||||
|
-- to make it easier to make sense of certificate changes
|
||||||
|
|
||||||
|
ALTER TABLE DOMAIN_SECURITY_EVENTS ADD COLUMN CHANGE_SCHEMA ENUM('NONE', 'HTTP_TO_HTTPS', 'HTTPS_TO_HTTP', 'UNKNOWN') NOT NULL DEFAULT 'UNKNOWN';
|
||||||
|
OPTIMIZE TABLE DOMAIN_SECURITY_EVENTS;
|
@@ -0,0 +1,12 @@
|
|||||||
|
-- Table holding domains to be processed by the NDP in order to figure out whether to add them to
|
||||||
|
-- be crawled.
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS NDP_NEW_DOMAINS(
|
||||||
|
DOMAIN_ID INT NOT NULL PRIMARY KEY,
|
||||||
|
STATE ENUM ('NEW', 'ACCEPTED', 'REJECTED') NOT NULL DEFAULT 'NEW',
|
||||||
|
PRIORITY INT NOT NULL DEFAULT 0,
|
||||||
|
TS_CHANGE TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
|
||||||
|
CHECK_COUNT INT NOT NULL DEFAULT 0
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS NDP_NEW_DOMAINS__STATE_PRIORITY ON NDP_NEW_DOMAINS (STATE, PRIORITY DESC);
|
@@ -0,0 +1,3 @@
|
|||||||
|
-- Migration script to add AUTO_ASSIGN_DOMAINS column to NODE_CONFIGURATION table
|
||||||
|
|
||||||
|
ALTER TABLE NODE_CONFIGURATION ADD COLUMN AUTO_ASSIGN_DOMAINS BOOLEAN NOT NULL DEFAULT TRUE;
|
@@ -5,13 +5,15 @@ import java.util.Collection;
|
|||||||
public enum HtmlFeature {
|
public enum HtmlFeature {
|
||||||
// Note, the first 32 of these features are bit encoded in the database
|
// Note, the first 32 of these features are bit encoded in the database
|
||||||
// so be sure to keep anything that's potentially important toward the top
|
// so be sure to keep anything that's potentially important toward the top
|
||||||
// of the list
|
// of the list; but adding new values will shift the encoded values and break
|
||||||
|
// binary compatibility! Scroll down for a marker where you should add new values
|
||||||
|
// if they need to be accessible from IndexResultScoreCalculator!
|
||||||
|
|
||||||
MEDIA( "special:media"),
|
MEDIA( "special:media"),
|
||||||
JS("special:scripts"),
|
JS("special:scripts"),
|
||||||
AFFILIATE_LINK( "special:affiliate"),
|
AFFILIATE_LINK( "special:affiliate"),
|
||||||
TRACKING("special:tracking"),
|
TRACKING("special:tracking"),
|
||||||
TRACKING_ADTECH("special:ads"), // We'll call this ads for now
|
TRACKING_ADTECH("special:adtech"),
|
||||||
|
|
||||||
KEBAB_CASE_URL("special:kcurl"), // https://www.example.com/urls-that-look-like-this/
|
KEBAB_CASE_URL("special:kcurl"), // https://www.example.com/urls-that-look-like-this/
|
||||||
LONG_URL("special:longurl"),
|
LONG_URL("special:longurl"),
|
||||||
@@ -30,6 +32,15 @@ public enum HtmlFeature {
|
|||||||
|
|
||||||
PDF("format:pdf"),
|
PDF("format:pdf"),
|
||||||
|
|
||||||
|
POPOVER("special:popover"),
|
||||||
|
CONSENT("special:consent"),
|
||||||
|
SHORT_DOCUMENT("special:shorty"),
|
||||||
|
THIRD_PARTY_REQUESTS("special:3pr"),
|
||||||
|
|
||||||
|
// Here! It is generally safe to add additional values here without
|
||||||
|
// disrupting the encoded values used by the DocumentValuator
|
||||||
|
// class in the index!
|
||||||
|
|
||||||
/** For fingerprinting and ranking */
|
/** For fingerprinting and ranking */
|
||||||
OPENGRAPH("special:opengraph"),
|
OPENGRAPH("special:opengraph"),
|
||||||
OPENGRAPH_IMAGE("special:opengraph:image"),
|
OPENGRAPH_IMAGE("special:opengraph:image"),
|
||||||
@@ -67,6 +78,7 @@ public enum HtmlFeature {
|
|||||||
|
|
||||||
S3_FEATURE("special:s3"),
|
S3_FEATURE("special:s3"),
|
||||||
|
|
||||||
|
MISSING_DOM_SAMPLE("special:nosample"),
|
||||||
UNKNOWN("special:uncategorized");
|
UNKNOWN("special:uncategorized");
|
||||||
|
|
||||||
|
|
||||||
|
@@ -6,11 +6,20 @@ import nu.marginalia.model.EdgeDomain;
|
|||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
|
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
|
import java.time.Instant;
|
||||||
|
|
||||||
public class GsonFactory {
|
public class GsonFactory {
|
||||||
public static Gson get() {
|
public static Gson get() {
|
||||||
return new GsonBuilder()
|
return new GsonBuilder()
|
||||||
.registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
|
.registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
|
||||||
|
.registerTypeAdapter(Instant.class, (JsonSerializer<Instant>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toEpochMilli()))
|
||||||
|
.registerTypeAdapter(Instant.class, (JsonDeserializer<Instant>) (json, typeOfT, context) -> {
|
||||||
|
if (json.isJsonPrimitive() && json.getAsJsonPrimitive().isNumber()) {
|
||||||
|
return Instant.ofEpochMilli(json.getAsLong());
|
||||||
|
} else {
|
||||||
|
throw new JsonParseException("Expected a number for Instant");
|
||||||
|
}
|
||||||
|
})
|
||||||
.registerTypeAdapter(EdgeUrl.class, (JsonSerializer<EdgeUrl>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
|
.registerTypeAdapter(EdgeUrl.class, (JsonSerializer<EdgeUrl>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
|
||||||
.registerTypeAdapter(EdgeDomain.class, (JsonSerializer<EdgeDomain>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
|
.registerTypeAdapter(EdgeDomain.class, (JsonSerializer<EdgeDomain>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
|
||||||
.registerTypeAdapter(EdgeUrl.class, (JsonDeserializer<EdgeUrl>) (json, typeOfT, context) -> {
|
.registerTypeAdapter(EdgeUrl.class, (JsonDeserializer<EdgeUrl>) (json, typeOfT, context) -> {
|
||||||
|
@@ -0,0 +1,59 @@
|
|||||||
|
package nu.marginalia.process.control;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import nu.marginalia.process.ProcessConfiguration;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.UUID;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class ProcessEventLog {
|
||||||
|
private final HikariDataSource dataSource;
|
||||||
|
|
||||||
|
private final Logger logger = LoggerFactory.getLogger(ProcessEventLog.class);
|
||||||
|
|
||||||
|
private final String serviceName;
|
||||||
|
private final UUID instanceUuid;
|
||||||
|
private final String serviceBase;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public ProcessEventLog(HikariDataSource dataSource, ProcessConfiguration configuration) {
|
||||||
|
this.dataSource = dataSource;
|
||||||
|
|
||||||
|
this.serviceName = configuration.processName() + ":" + configuration.node();
|
||||||
|
this.instanceUuid = configuration.instanceUuid();
|
||||||
|
this.serviceBase = configuration.processName();
|
||||||
|
|
||||||
|
logger.info("Starting service {} instance {}", serviceName, instanceUuid);
|
||||||
|
|
||||||
|
logEvent("PCS-START", serviceName);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void logEvent(Class<?> type, String message) {
|
||||||
|
logEvent(type.getSimpleName(), message);
|
||||||
|
}
|
||||||
|
public void logEvent(String type, String message) {
|
||||||
|
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.prepareStatement("""
|
||||||
|
INSERT INTO SERVICE_EVENTLOG(SERVICE_NAME, SERVICE_BASE, INSTANCE, EVENT_TYPE, EVENT_MESSAGE)
|
||||||
|
VALUES (?, ?, ?, ?, ?)
|
||||||
|
""")) {
|
||||||
|
stmt.setString(1, serviceName);
|
||||||
|
stmt.setString(2, serviceBase);
|
||||||
|
stmt.setString(3, instanceUuid.toString());
|
||||||
|
stmt.setString(4, type);
|
||||||
|
stmt.setString(5, Objects.requireNonNull(message, ""));
|
||||||
|
|
||||||
|
stmt.executeUpdate();
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
logger.error("Failed to log event {}:{}", type, message);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@@ -7,7 +7,6 @@ public enum ServiceId {
|
|||||||
Search("search-service"),
|
Search("search-service"),
|
||||||
Index("index-service"),
|
Index("index-service"),
|
||||||
Query("query-service"),
|
Query("query-service"),
|
||||||
Executor("executor-service"),
|
|
||||||
|
|
||||||
Control("control-service"),
|
Control("control-service"),
|
||||||
|
|
||||||
|
@@ -13,6 +13,7 @@ import nu.marginalia.service.discovery.property.ServicePartition;
|
|||||||
import nu.marginalia.util.NamedExecutorFactory;
|
import nu.marginalia.util.NamedExecutorFactory;
|
||||||
|
|
||||||
import java.util.concurrent.Executor;
|
import java.util.concurrent.Executor;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
@@ -20,10 +21,15 @@ public class GrpcChannelPoolFactory {
|
|||||||
|
|
||||||
private final NodeConfigurationWatcher nodeConfigurationWatcher;
|
private final NodeConfigurationWatcher nodeConfigurationWatcher;
|
||||||
private final ServiceRegistryIf serviceRegistryIf;
|
private final ServiceRegistryIf serviceRegistryIf;
|
||||||
private static final Executor executor = NamedExecutorFactory.createFixed("gRPC-Channel-Pool",
|
|
||||||
Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
|
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
|
||||||
private static final Executor offloadExecutor = NamedExecutorFactory.createFixed("gRPC-Offload-Pool",
|
|
||||||
Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
|
private static final Executor executor = useLoom
|
||||||
|
? Executors.newVirtualThreadPerTaskExecutor()
|
||||||
|
: NamedExecutorFactory.createFixed("gRPC-Channel-Pool", Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
|
||||||
|
private static final Executor offloadExecutor = useLoom
|
||||||
|
? Executors.newVirtualThreadPerTaskExecutor()
|
||||||
|
: NamedExecutorFactory.createFixed("gRPC-Offload-Pool", Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 32));
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public GrpcChannelPoolFactory(NodeConfigurationWatcher nodeConfigurationWatcher,
|
public GrpcChannelPoolFactory(NodeConfigurationWatcher nodeConfigurationWatcher,
|
||||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.service.client;
|
|||||||
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
import io.grpc.ManagedChannel;
|
import io.grpc.ManagedChannel;
|
||||||
|
import io.grpc.StatusRuntimeException;
|
||||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||||
import nu.marginalia.service.discovery.monitor.ServiceChangeMonitor;
|
import nu.marginalia.service.discovery.monitor.ServiceChangeMonitor;
|
||||||
import nu.marginalia.service.discovery.property.PartitionTraits;
|
import nu.marginalia.service.discovery.property.PartitionTraits;
|
||||||
@@ -206,6 +207,11 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (var e : exceptions) {
|
for (var e : exceptions) {
|
||||||
|
if (e instanceof StatusRuntimeException se) {
|
||||||
|
throw se; // Re-throw SRE as-is
|
||||||
|
}
|
||||||
|
|
||||||
|
// If there are other exceptions, log them
|
||||||
logger.error(grpcMarker, "Failed to call service {}", serviceKey, e);
|
logger.error(grpcMarker, "Failed to call service {}", serviceKey, e);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -1,17 +1,21 @@
|
|||||||
package nu.marginalia.service.discovery;
|
package nu.marginalia.service.discovery;
|
||||||
|
|
||||||
import nu.marginalia.service.discovery.monitor.*;
|
import com.google.inject.ImplementedBy;
|
||||||
|
import nu.marginalia.service.discovery.monitor.ServiceChangeMonitor;
|
||||||
|
import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
|
||||||
import nu.marginalia.service.discovery.property.ServiceEndpoint;
|
import nu.marginalia.service.discovery.property.ServiceEndpoint;
|
||||||
import static nu.marginalia.service.discovery.property.ServiceEndpoint.*;
|
|
||||||
|
|
||||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||||
|
import org.apache.curator.framework.recipes.locks.InterProcessSemaphoreV2;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
|
|
||||||
|
import static nu.marginalia.service.discovery.property.ServiceEndpoint.InstanceAddress;
|
||||||
|
|
||||||
/** A service registry that allows services to register themselves and
|
/** A service registry that allows services to register themselves and
|
||||||
* be discovered by other services on the network.
|
* be discovered by other services on the network.
|
||||||
*/
|
*/
|
||||||
|
@ImplementedBy(ZkServiceRegistry.class)
|
||||||
public interface ServiceRegistryIf {
|
public interface ServiceRegistryIf {
|
||||||
/**
|
/**
|
||||||
* Register a service with the registry.
|
* Register a service with the registry.
|
||||||
@@ -57,4 +61,9 @@ public interface ServiceRegistryIf {
|
|||||||
* </ul>
|
* </ul>
|
||||||
* */
|
* */
|
||||||
void registerMonitor(ServiceMonitorIf monitor) throws Exception;
|
void registerMonitor(ServiceMonitorIf monitor) throws Exception;
|
||||||
|
|
||||||
|
void registerProcess(String processName, int nodeId);
|
||||||
|
void deregisterProcess(String processName, int nodeId);
|
||||||
|
|
||||||
|
InterProcessSemaphoreV2 getSemaphore(String name, int permits) throws Exception;
|
||||||
}
|
}
|
||||||
|
@@ -6,6 +6,7 @@ import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
|
|||||||
import nu.marginalia.service.discovery.property.ServiceEndpoint;
|
import nu.marginalia.service.discovery.property.ServiceEndpoint;
|
||||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||||
import org.apache.curator.framework.CuratorFramework;
|
import org.apache.curator.framework.CuratorFramework;
|
||||||
|
import org.apache.curator.framework.recipes.locks.InterProcessSemaphoreV2;
|
||||||
import org.apache.curator.utils.ZKPaths;
|
import org.apache.curator.utils.ZKPaths;
|
||||||
import org.apache.zookeeper.CreateMode;
|
import org.apache.zookeeper.CreateMode;
|
||||||
import org.apache.zookeeper.Watcher;
|
import org.apache.zookeeper.Watcher;
|
||||||
@@ -256,6 +257,42 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
|
|||||||
.forPath("/running-instances");
|
.forPath("/running-instances");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void registerProcess(String processName, int nodeId) {
|
||||||
|
String path = "/process-locks/" + processName + "/" + nodeId;
|
||||||
|
try {
|
||||||
|
curatorFramework.create()
|
||||||
|
.creatingParentsIfNeeded()
|
||||||
|
.withMode(CreateMode.EPHEMERAL)
|
||||||
|
.forPath(path);
|
||||||
|
livenessPaths.add(path);
|
||||||
|
}
|
||||||
|
catch (Exception ex) {
|
||||||
|
logger.error("Failed to register process {} on node {}", processName, nodeId, ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void deregisterProcess(String processName, int nodeId) {
|
||||||
|
String path = "/process-locks/" + processName + "/" + nodeId;
|
||||||
|
try {
|
||||||
|
curatorFramework.delete().forPath(path);
|
||||||
|
livenessPaths.remove(path);
|
||||||
|
}
|
||||||
|
catch (Exception ex) {
|
||||||
|
logger.error("Failed to deregister process {} on node {}", processName, nodeId, ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public InterProcessSemaphoreV2 getSemaphore(String name, int permits) {
|
||||||
|
if (stopped)
|
||||||
|
throw new IllegalStateException("Service registry is stopped, cannot get semaphore " + name);
|
||||||
|
|
||||||
|
String path = "/semaphores/" + name;
|
||||||
|
return new InterProcessSemaphoreV2(curatorFramework, path, permits);
|
||||||
|
}
|
||||||
|
|
||||||
/* Exposed for tests */
|
/* Exposed for tests */
|
||||||
public synchronized void shutDown() {
|
public synchronized void shutDown() {
|
||||||
if (stopped)
|
if (stopped)
|
||||||
|
@@ -1,9 +1,9 @@
|
|||||||
package nu.marginalia.service.server;
|
package nu.marginalia.service.server;
|
||||||
|
|
||||||
import io.grpc.Server;
|
import io.grpc.Server;
|
||||||
import io.grpc.netty.shaded.io.grpc.netty.NettyServerBuilder;
|
import io.grpc.netty.NettyServerBuilder;
|
||||||
import io.grpc.netty.shaded.io.netty.channel.nio.NioEventLoopGroup;
|
import io.netty.channel.nio.NioEventLoopGroup;
|
||||||
import io.grpc.netty.shaded.io.netty.channel.socket.nio.NioServerSocketChannel;
|
import io.netty.channel.socket.nio.NioServerSocketChannel;
|
||||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||||
@@ -13,9 +13,14 @@ import nu.marginalia.util.NamedExecutorFactory;
|
|||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.net.InetSocketAddress;
|
import java.net.InetSocketAddress;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
|
||||||
public class GrpcServer {
|
public class GrpcServer {
|
||||||
private final Server server;
|
private final Server server;
|
||||||
|
|
||||||
|
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
|
||||||
|
|
||||||
public GrpcServer(ServiceConfiguration config,
|
public GrpcServer(ServiceConfiguration config,
|
||||||
ServiceRegistryIf serviceRegistry,
|
ServiceRegistryIf serviceRegistry,
|
||||||
ServicePartition partition,
|
ServicePartition partition,
|
||||||
@@ -26,13 +31,19 @@ public class GrpcServer {
|
|||||||
int nThreads = Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 16);
|
int nThreads = Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 16);
|
||||||
|
|
||||||
// Start the gRPC server
|
// Start the gRPC server
|
||||||
|
|
||||||
|
ExecutorService workExecutor = useLoom ?
|
||||||
|
Executors.newVirtualThreadPerTaskExecutor() :
|
||||||
|
NamedExecutorFactory.createFixed("nettyExecutor", nThreads);
|
||||||
|
|
||||||
var grpcServerBuilder = NettyServerBuilder.forAddress(new InetSocketAddress(config.bindAddress(), port))
|
var grpcServerBuilder = NettyServerBuilder.forAddress(new InetSocketAddress(config.bindAddress(), port))
|
||||||
.executor(NamedExecutorFactory.createFixed("nettyExecutor", nThreads))
|
.executor(workExecutor)
|
||||||
.workerEventLoopGroup(new NioEventLoopGroup(nThreads, NamedExecutorFactory.createFixed("Worker-ELG", nThreads)))
|
.workerEventLoopGroup(new NioEventLoopGroup(nThreads, NamedExecutorFactory.createFixed("Worker-ELG", nThreads)))
|
||||||
.bossEventLoopGroup(new NioEventLoopGroup(nThreads, NamedExecutorFactory.createFixed("Boss-ELG", nThreads)))
|
.bossEventLoopGroup(new NioEventLoopGroup(nThreads, NamedExecutorFactory.createFixed("Boss-ELG", nThreads)))
|
||||||
.channelType(NioServerSocketChannel.class);
|
.channelType(NioServerSocketChannel.class);
|
||||||
|
|
||||||
for (var grpcService : grpcServices) {
|
for (var grpcService : grpcServices) {
|
||||||
|
|
||||||
if (!grpcService.shouldRegisterService()) {
|
if (!grpcService.shouldRegisterService()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@@ -125,8 +125,7 @@ public class JoobyService {
|
|||||||
// Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
|
// Set a cap on the number of worker threads, as Jooby's default value does not seem to consider
|
||||||
// multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
|
// multi-tenant servers with high thread counts, and spins up an exorbitant number of threads in that
|
||||||
// scenario
|
// scenario
|
||||||
options.setWorkerThreads(Math.min(128, options.getWorkerThreads()));
|
options.setWorkerThreads(Math.min(16, options.getWorkerThreads()));
|
||||||
|
|
||||||
|
|
||||||
jooby.setServerOptions(options);
|
jooby.setServerOptions(options);
|
||||||
|
|
||||||
|
@@ -7,6 +7,7 @@
|
|||||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
</Filters>
|
</Filters>
|
||||||
</Console>
|
</Console>
|
||||||
<Console name="ProcessConsole" target="SYSTEM_OUT">
|
<Console name="ProcessConsole" target="SYSTEM_OUT">
|
||||||
@@ -23,6 +24,7 @@
|
|||||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
</Filters>
|
</Filters>
|
||||||
<SizeBasedTriggeringPolicy size="10MB" />
|
<SizeBasedTriggeringPolicy size="10MB" />
|
||||||
</RollingFile>
|
</RollingFile>
|
||||||
@@ -36,6 +38,16 @@
|
|||||||
<MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
|
<MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
|
||||||
</Filters>
|
</Filters>
|
||||||
</RollingFile>
|
</RollingFile>
|
||||||
|
<!-- Audit log for converter events: only messages carrying the CONVERTER marker are written here.
     NOTE(review): confirm that no other appender in this file is also named "LogToFile";
     appenders are looked up by name, so a duplicate is expected to shadow the other one. -->
<RollingFile name="LogToFile"
             fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log"
             filePattern="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
             ignoreExceptions="false">
    <PatternLayout>
        <Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
    </PatternLayout>
    <SizeBasedTriggeringPolicy size="100MB" />
    <Filters>
        <MarkerFilter marker="CONVERTER" onMatch="ALLOW" onMismatch="DENY" />
    </Filters>
</RollingFile>
|
||||||
</Appenders>
|
</Appenders>
|
||||||
<Loggers>
|
<Loggers>
|
||||||
<Logger name="org.apache.zookeeper" level="WARN" />
|
<Logger name="org.apache.zookeeper" level="WARN" />
|
||||||
|
@@ -8,6 +8,7 @@
|
|||||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
</Filters>
|
</Filters>
|
||||||
</Console>
|
</Console>
|
||||||
<Console name="ConsoleWarn" target="SYSTEM_OUT">
|
<Console name="ConsoleWarn" target="SYSTEM_OUT">
|
||||||
@@ -18,6 +19,7 @@
|
|||||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
</Filters>
|
</Filters>
|
||||||
</Console>
|
</Console>
|
||||||
<Console name="ConsoleError" target="SYSTEM_OUT">
|
<Console name="ConsoleError" target="SYSTEM_OUT">
|
||||||
@@ -28,6 +30,7 @@
|
|||||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
</Filters>
|
</Filters>
|
||||||
</Console>
|
</Console>
|
||||||
<Console name="ConsoleFatal" target="SYSTEM_OUT">
|
<Console name="ConsoleFatal" target="SYSTEM_OUT">
|
||||||
@@ -38,6 +41,7 @@
|
|||||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
</Filters>
|
</Filters>
|
||||||
</Console>
|
</Console>
|
||||||
<Console name="ProcessConsole" target="SYSTEM_OUT">
|
<Console name="ProcessConsole" target="SYSTEM_OUT">
|
||||||
@@ -57,6 +61,7 @@
|
|||||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
|
<MarkerFilter marker="CONVERTER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||||
</Filters>
|
</Filters>
|
||||||
</RollingFile>
|
</RollingFile>
|
||||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||||
@@ -69,6 +74,16 @@
|
|||||||
<MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
|
<MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
|
||||||
</Filters>
|
</Filters>
|
||||||
</RollingFile>
|
</RollingFile>
|
||||||
|
<!-- Audit log for converter events: only messages carrying the CONVERTER marker are written here.
     NOTE(review): the crawler-audit appender defined just before this one is also named
     "LogToFile"; appenders are looked up by name, so one of the two is expected to shadow
     the other. Give this appender a unique name together with a matching AppenderRef. -->
<RollingFile name="LogToFile"
             fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}.log"
             filePattern="${env:WMSA_LOG_DIR:-/var/log/wmsa}/converter-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
             ignoreExceptions="false">
    <PatternLayout>
        <Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
    </PatternLayout>
    <SizeBasedTriggeringPolicy size="100MB" />
    <Filters>
        <MarkerFilter marker="CONVERTER" onMatch="ALLOW" onMismatch="DENY" />
    </Filters>
</RollingFile>
|
||||||
</Appenders>
|
</Appenders>
|
||||||
<Loggers>
|
<Loggers>
|
||||||
<Logger name="org.apache.zookeeper" level="WARN" />
|
<Logger name="org.apache.zookeeper" level="WARN" />
|
||||||
|
@@ -9,6 +9,7 @@ import nu.marginalia.executor.storage.FileStorageFile;
|
|||||||
import nu.marginalia.executor.upload.UploadDirContents;
|
import nu.marginalia.executor.upload.UploadDirContents;
|
||||||
import nu.marginalia.executor.upload.UploadDirItem;
|
import nu.marginalia.executor.upload.UploadDirItem;
|
||||||
import nu.marginalia.functions.execution.api.*;
|
import nu.marginalia.functions.execution.api.*;
|
||||||
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
import nu.marginalia.service.ServiceId;
|
import nu.marginalia.service.ServiceId;
|
||||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||||
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
|
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
|
||||||
@@ -25,27 +26,37 @@ import java.net.URISyntaxException;
|
|||||||
import java.net.URL;
|
import java.net.URL;
|
||||||
import java.net.URLEncoder;
|
import java.net.URLEncoder;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.time.Duration;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import static nu.marginalia.functions.execution.api.ExecutorApiGrpc.ExecutorApiBlockingStub;
|
import static nu.marginalia.functions.execution.api.ExecutorApiGrpc.ExecutorApiBlockingStub;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
public class ExecutorClient {
|
public class ExecutorClient {
|
||||||
|
private final MqPersistence persistence;
|
||||||
private final GrpcMultiNodeChannelPool<ExecutorApiBlockingStub> channelPool;
|
private final GrpcMultiNodeChannelPool<ExecutorApiBlockingStub> channelPool;
|
||||||
private static final Logger logger = LoggerFactory.getLogger(ExecutorClient.class);
|
private static final Logger logger = LoggerFactory.getLogger(ExecutorClient.class);
|
||||||
private final ServiceRegistryIf registry;
|
private final ServiceRegistryIf registry;
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public ExecutorClient(ServiceRegistryIf registry,
|
public ExecutorClient(ServiceRegistryIf registry,
|
||||||
|
MqPersistence persistence,
|
||||||
GrpcChannelPoolFactory grpcChannelPoolFactory)
|
GrpcChannelPoolFactory grpcChannelPoolFactory)
|
||||||
{
|
{
|
||||||
this.registry = registry;
|
this.registry = registry;
|
||||||
|
this.persistence = persistence;
|
||||||
this.channelPool = grpcChannelPoolFactory
|
this.channelPool = grpcChannelPoolFactory
|
||||||
.createMulti(
|
.createMulti(
|
||||||
ServiceKey.forGrpcApi(ExecutorApiGrpc.class, ServicePartition.multi()),
|
ServiceKey.forGrpcApi(ExecutorApiGrpc.class, ServicePartition.multi()),
|
||||||
ExecutorApiGrpc::newBlockingStub);
|
ExecutorApiGrpc::newBlockingStub);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private long createTrackingTokenMsg(String task, int node, Duration ttl) throws Exception {
|
||||||
|
return persistence.sendNewMessage("task-tracking[" + node + "]", "export-client", null, task, "", ttl);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public void startFsm(int node, String actorName) {
|
public void startFsm(int node, String actorName) {
|
||||||
channelPool.call(ExecutorApiBlockingStub::startFsm)
|
channelPool.call(ExecutorApiBlockingStub::startFsm)
|
||||||
.forNode(node)
|
.forNode(node)
|
||||||
@@ -96,6 +107,16 @@ public class ExecutorClient {
|
|||||||
.build());
|
.build());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public long updateNsfwFilters() throws Exception {
|
||||||
|
long msgId = createTrackingTokenMsg("nsfw-filters", 1, Duration.ofHours(6));
|
||||||
|
|
||||||
|
channelPool.call(ExecutorApiBlockingStub::updateNsfwFilters)
|
||||||
|
.forNode(1)
|
||||||
|
.run(RpcUpdateNsfwFilters.newBuilder().setMsgId(msgId).build());
|
||||||
|
|
||||||
|
return msgId;
|
||||||
|
}
|
||||||
|
|
||||||
public ActorRunStates getActorStates(int node) {
|
public ActorRunStates getActorStates(int node) {
|
||||||
try {
|
try {
|
||||||
var rs = channelPool.call(ExecutorApiBlockingStub::getActorStates)
|
var rs = channelPool.call(ExecutorApiBlockingStub::getActorStates)
|
||||||
@@ -168,7 +189,7 @@ public class ExecutorClient {
|
|||||||
String uriPath = "/transfer/file/" + fileStorage.id();
|
String uriPath = "/transfer/file/" + fileStorage.id();
|
||||||
String uriQuery = "path=" + URLEncoder.encode(path, StandardCharsets.UTF_8);
|
String uriQuery = "path=" + URLEncoder.encode(path, StandardCharsets.UTF_8);
|
||||||
|
|
||||||
var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Executor, fileStorage.node()));
|
var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Index, fileStorage.node()));
|
||||||
if (endpoints.isEmpty()) {
|
if (endpoints.isEmpty()) {
|
||||||
throw new RuntimeException("No endpoints for node " + fileStorage.node());
|
throw new RuntimeException("No endpoints for node " + fileStorage.node());
|
||||||
}
|
}
|
||||||
|
@@ -18,6 +18,8 @@ service ExecutorApi {
|
|||||||
rpc calculateAdjacencies(Empty) returns (Empty) {}
|
rpc calculateAdjacencies(Empty) returns (Empty) {}
|
||||||
rpc restoreBackup(RpcFileStorageId) returns (Empty) {}
|
rpc restoreBackup(RpcFileStorageId) returns (Empty) {}
|
||||||
|
|
||||||
|
rpc updateNsfwFilters(RpcUpdateNsfwFilters) returns (Empty) {}
|
||||||
|
|
||||||
rpc restartExecutorService(Empty) returns (Empty) {}
|
rpc restartExecutorService(Empty) returns (Empty) {}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -66,6 +68,9 @@ message RpcExportRequest {
|
|||||||
int64 fileStorageId = 1;
|
int64 fileStorageId = 1;
|
||||||
int64 msgId = 2;
|
int64 msgId = 2;
|
||||||
}
|
}
|
||||||
|
// Request message for the ExecutorApi.updateNsfwFilters rpc.
message RpcUpdateNsfwFilters {
  // Id of the tracking message associated with this request.
  int64 msgId = 1;
}
|
||||||
message RpcFileStorageIdWithDomainName {
|
message RpcFileStorageIdWithDomainName {
|
||||||
int64 fileStorageId = 1;
|
int64 fileStorageId = 1;
|
||||||
string targetDomainName = 2;
|
string targetDomainName = 2;
|
||||||
|
@@ -19,6 +19,8 @@ dependencies {
|
|||||||
implementation project(':code:processes:crawling-process')
|
implementation project(':code:processes:crawling-process')
|
||||||
implementation project(':code:processes:live-crawling-process')
|
implementation project(':code:processes:live-crawling-process')
|
||||||
implementation project(':code:processes:loading-process')
|
implementation project(':code:processes:loading-process')
|
||||||
|
implementation project(':code:processes:ping-process')
|
||||||
|
implementation project(':code:processes:new-domain-process')
|
||||||
implementation project(':code:processes:converting-process')
|
implementation project(':code:processes:converting-process')
|
||||||
implementation project(':code:processes:index-constructor-process')
|
implementation project(':code:processes:index-constructor-process')
|
||||||
|
|
||||||
@@ -37,9 +39,9 @@ dependencies {
|
|||||||
implementation project(':code:functions:link-graph:api')
|
implementation project(':code:functions:link-graph:api')
|
||||||
implementation project(':code:functions:live-capture:api')
|
implementation project(':code:functions:live-capture:api')
|
||||||
implementation project(':code:functions:search-query')
|
implementation project(':code:functions:search-query')
|
||||||
|
implementation project(':code:functions:nsfw-domain-filter')
|
||||||
implementation project(':code:execution:api')
|
implementation project(':code:execution:api')
|
||||||
|
|
||||||
implementation project(':code:processes:crawling-process:model')
|
|
||||||
implementation project(':code:processes:crawling-process:model')
|
implementation project(':code:processes:crawling-process:model')
|
||||||
implementation project(':code:processes:crawling-process:ft-link-parser')
|
implementation project(':code:processes:crawling-process:ft-link-parser')
|
||||||
implementation project(':code:index:index-journal')
|
implementation project(':code:index:index-journal')
|
||||||
|
@@ -2,10 +2,11 @@ package nu.marginalia.actor;
|
|||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.functions.execution.api.*;
|
import nu.marginalia.functions.execution.api.RpcFsmName;
|
||||||
|
import nu.marginalia.functions.execution.api.RpcProcessId;
|
||||||
import nu.marginalia.mq.MqMessageState;
|
import nu.marginalia.mq.MqMessageState;
|
||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
@@ -14,18 +15,18 @@ import spark.Spark;
|
|||||||
@Singleton
|
@Singleton
|
||||||
public class ActorApi {
|
public class ActorApi {
|
||||||
private final ExecutorActorControlService actors;
|
private final ExecutorActorControlService actors;
|
||||||
private final ProcessService processService;
|
private final ProcessSpawnerService processSpawnerService;
|
||||||
private final MqPersistence mqPersistence;
|
private final MqPersistence mqPersistence;
|
||||||
private final ServiceConfiguration serviceConfiguration;
|
private final ServiceConfiguration serviceConfiguration;
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
@Inject
|
@Inject
|
||||||
public ActorApi(ExecutorActorControlService actors,
|
public ActorApi(ExecutorActorControlService actors,
|
||||||
ProcessService processService,
|
ProcessSpawnerService processSpawnerService,
|
||||||
MqPersistence mqPersistence,
|
MqPersistence mqPersistence,
|
||||||
ServiceConfiguration serviceConfiguration)
|
ServiceConfiguration serviceConfiguration)
|
||||||
{
|
{
|
||||||
this.actors = actors;
|
this.actors = actors;
|
||||||
this.processService = processService;
|
this.processSpawnerService = processSpawnerService;
|
||||||
this.mqPersistence = mqPersistence;
|
this.mqPersistence = mqPersistence;
|
||||||
this.serviceConfiguration = serviceConfiguration;
|
this.serviceConfiguration = serviceConfiguration;
|
||||||
}
|
}
|
||||||
@@ -43,7 +44,7 @@ public class ActorApi {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public Object stopProcess(RpcProcessId processId) {
|
public Object stopProcess(RpcProcessId processId) {
|
||||||
ProcessService.ProcessId id = ProcessService.translateExternalIdBase(processId.getProcessId());
|
ProcessSpawnerService.ProcessId id = ProcessSpawnerService.translateExternalIdBase(processId.getProcessId());
|
||||||
|
|
||||||
try {
|
try {
|
||||||
String inbox = id.name().toLowerCase() + ":" + serviceConfiguration.node();
|
String inbox = id.name().toLowerCase() + ":" + serviceConfiguration.node();
|
||||||
@@ -60,7 +61,7 @@ public class ActorApi {
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
processService.kill(id);
|
processSpawnerService.kill(id);
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
logger.error("Failed to stop process {}", id, ex);
|
logger.error("Failed to stop process {}", id, ex);
|
||||||
|
@@ -6,12 +6,15 @@ import java.util.Set;
|
|||||||
|
|
||||||
public enum ExecutorActor {
|
public enum ExecutorActor {
|
||||||
PREC_EXPORT_ALL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
PREC_EXPORT_ALL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||||
|
UPDATE_NSFW_LISTS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD, NodeProfile.REALTIME),
|
||||||
|
|
||||||
CRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
CRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||||
RECRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
RECRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||||
RECRAWL_SINGLE_DOMAIN(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
RECRAWL_SINGLE_DOMAIN(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||||
PROC_CRAWLER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
PROC_CRAWLER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||||
|
PROC_PING_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.REALTIME),
|
||||||
PROC_EXPORT_TASKS_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
PROC_EXPORT_TASKS_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||||
|
PROC_NDP_SPAWNER(NodeProfile.MIXED, NodeProfile.REALTIME),
|
||||||
ADJACENCY_CALCULATION(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
ADJACENCY_CALCULATION(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||||
EXPORT_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
EXPORT_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||||
EXPORT_SEGMENTATION_MODEL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
EXPORT_SEGMENTATION_MODEL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||||
@@ -35,7 +38,8 @@ public enum ExecutorActor {
|
|||||||
LIVE_CRAWL(NodeProfile.REALTIME),
|
LIVE_CRAWL(NodeProfile.REALTIME),
|
||||||
PROC_LIVE_CRAWL_SPAWNER(NodeProfile.REALTIME),
|
PROC_LIVE_CRAWL_SPAWNER(NodeProfile.REALTIME),
|
||||||
SCRAPE_FEEDS(NodeProfile.REALTIME),
|
SCRAPE_FEEDS(NodeProfile.REALTIME),
|
||||||
UPDATE_RSS(NodeProfile.REALTIME);
|
UPDATE_RSS(NodeProfile.REALTIME)
|
||||||
|
;
|
||||||
|
|
||||||
public String id() {
|
public String id() {
|
||||||
return "fsm:" + name().toLowerCase();
|
return "fsm:" + name().toLowerCase();
|
||||||
|
@@ -49,6 +49,8 @@ public class ExecutorActorControlService {
|
|||||||
RecrawlSingleDomainActor recrawlSingleDomainActor,
|
RecrawlSingleDomainActor recrawlSingleDomainActor,
|
||||||
RestoreBackupActor restoreBackupActor,
|
RestoreBackupActor restoreBackupActor,
|
||||||
ConverterMonitorActor converterMonitorFSM,
|
ConverterMonitorActor converterMonitorFSM,
|
||||||
|
NdpMonitorActor ndpMonitorActor,
|
||||||
|
PingMonitorActor pingMonitorActor,
|
||||||
CrawlerMonitorActor crawlerMonitorActor,
|
CrawlerMonitorActor crawlerMonitorActor,
|
||||||
LiveCrawlerMonitorActor liveCrawlerMonitorActor,
|
LiveCrawlerMonitorActor liveCrawlerMonitorActor,
|
||||||
LoaderMonitorActor loaderMonitor,
|
LoaderMonitorActor loaderMonitor,
|
||||||
@@ -68,6 +70,7 @@ public class ExecutorActorControlService {
|
|||||||
ExecutorActorStateMachines stateMachines,
|
ExecutorActorStateMachines stateMachines,
|
||||||
MigrateCrawlDataActor migrateCrawlDataActor,
|
MigrateCrawlDataActor migrateCrawlDataActor,
|
||||||
ExportAllPrecessionActor exportAllPrecessionActor,
|
ExportAllPrecessionActor exportAllPrecessionActor,
|
||||||
|
UpdateNsfwFiltersActor updateNsfwFiltersActor,
|
||||||
UpdateRssActor updateRssActor) throws SQLException {
|
UpdateRssActor updateRssActor) throws SQLException {
|
||||||
this.messageQueueFactory = messageQueueFactory;
|
this.messageQueueFactory = messageQueueFactory;
|
||||||
this.eventLog = baseServiceParams.eventLog;
|
this.eventLog = baseServiceParams.eventLog;
|
||||||
@@ -88,9 +91,10 @@ public class ExecutorActorControlService {
|
|||||||
register(ExecutorActor.PROC_CONVERTER_SPAWNER, converterMonitorFSM);
|
register(ExecutorActor.PROC_CONVERTER_SPAWNER, converterMonitorFSM);
|
||||||
register(ExecutorActor.PROC_LOADER_SPAWNER, loaderMonitor);
|
register(ExecutorActor.PROC_LOADER_SPAWNER, loaderMonitor);
|
||||||
register(ExecutorActor.PROC_CRAWLER_SPAWNER, crawlerMonitorActor);
|
register(ExecutorActor.PROC_CRAWLER_SPAWNER, crawlerMonitorActor);
|
||||||
|
register(ExecutorActor.PROC_PING_SPAWNER, pingMonitorActor);
|
||||||
register(ExecutorActor.PROC_LIVE_CRAWL_SPAWNER, liveCrawlerMonitorActor);
|
register(ExecutorActor.PROC_LIVE_CRAWL_SPAWNER, liveCrawlerMonitorActor);
|
||||||
register(ExecutorActor.PROC_EXPORT_TASKS_SPAWNER, exportTasksMonitorActor);
|
register(ExecutorActor.PROC_EXPORT_TASKS_SPAWNER, exportTasksMonitorActor);
|
||||||
|
register(ExecutorActor.PROC_NDP_SPAWNER, ndpMonitorActor);
|
||||||
register(ExecutorActor.MONITOR_PROCESS_LIVENESS, processMonitorFSM);
|
register(ExecutorActor.MONITOR_PROCESS_LIVENESS, processMonitorFSM);
|
||||||
register(ExecutorActor.MONITOR_FILE_STORAGE, fileStorageMonitorActor);
|
register(ExecutorActor.MONITOR_FILE_STORAGE, fileStorageMonitorActor);
|
||||||
|
|
||||||
@@ -109,6 +113,7 @@ public class ExecutorActorControlService {
|
|||||||
register(ExecutorActor.UPDATE_RSS, updateRssActor);
|
register(ExecutorActor.UPDATE_RSS, updateRssActor);
|
||||||
|
|
||||||
register(ExecutorActor.MIGRATE_CRAWL_DATA, migrateCrawlDataActor);
|
register(ExecutorActor.MIGRATE_CRAWL_DATA, migrateCrawlDataActor);
|
||||||
|
register(ExecutorActor.UPDATE_NSFW_LISTS, updateNsfwFiltersActor);
|
||||||
|
|
||||||
if (serviceConfiguration.node() == 1) {
|
if (serviceConfiguration.node() == 1) {
|
||||||
register(ExecutorActor.PREC_EXPORT_ALL, exportAllPrecessionActor);
|
register(ExecutorActor.PREC_EXPORT_ALL, exportAllPrecessionActor);
|
||||||
|
@@ -4,11 +4,14 @@ import com.google.gson.Gson;
|
|||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||||
import nu.marginalia.actor.state.*;
|
import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||||
import nu.marginalia.mq.persistence.MqMessageHandlerRegistry;
|
import nu.marginalia.actor.state.ActorStep;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.actor.state.Resume;
|
||||||
|
import nu.marginalia.actor.state.Terminal;
|
||||||
import nu.marginalia.mq.MqMessageState;
|
import nu.marginalia.mq.MqMessageState;
|
||||||
|
import nu.marginalia.mq.persistence.MqMessageHandlerRegistry;
|
||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
@@ -24,13 +27,13 @@ import java.util.concurrent.atomic.AtomicBoolean;
|
|||||||
public class AbstractProcessSpawnerActor extends RecordActorPrototype {
|
public class AbstractProcessSpawnerActor extends RecordActorPrototype {
|
||||||
|
|
||||||
private final MqPersistence persistence;
|
private final MqPersistence persistence;
|
||||||
private final ProcessService processService;
|
private final ProcessSpawnerService processSpawnerService;
|
||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
public static final int MAX_ATTEMPTS = 3;
|
public static final int MAX_ATTEMPTS = 3;
|
||||||
private final String inboxName;
|
private final String inboxName;
|
||||||
private final ProcessService.ProcessId processId;
|
private final ProcessSpawnerService.ProcessId processId;
|
||||||
private final ExecutorService executorService = Executors.newSingleThreadExecutor();
|
private final ExecutorService executorService = Executors.newSingleThreadExecutor();
|
||||||
private final int node;
|
private final int node;
|
||||||
|
|
||||||
@@ -50,7 +53,7 @@ public class AbstractProcessSpawnerActor extends RecordActorPrototype {
|
|||||||
for (;;) {
|
for (;;) {
|
||||||
var messages = persistence.eavesdrop(inboxName, 1);
|
var messages = persistence.eavesdrop(inboxName, 1);
|
||||||
|
|
||||||
if (messages.isEmpty() && !processService.isRunning(processId)) {
|
if (messages.isEmpty() && !processSpawnerService.isRunning(processId)) {
|
||||||
synchronized (processId) {
|
synchronized (processId) {
|
||||||
processId.wait(5000);
|
processId.wait(5000);
|
||||||
}
|
}
|
||||||
@@ -92,7 +95,7 @@ public class AbstractProcessSpawnerActor extends RecordActorPrototype {
|
|||||||
catch (InterruptedException ex) {
|
catch (InterruptedException ex) {
|
||||||
// We get this exception when the process is cancelled by the user
|
// We get this exception when the process is cancelled by the user
|
||||||
|
|
||||||
processService.kill(processId);
|
processSpawnerService.kill(processId);
|
||||||
setCurrentMessageToDead();
|
setCurrentMessageToDead();
|
||||||
|
|
||||||
yield new Aborted();
|
yield new Aborted();
|
||||||
@@ -112,13 +115,13 @@ public class AbstractProcessSpawnerActor extends RecordActorPrototype {
|
|||||||
public AbstractProcessSpawnerActor(Gson gson,
|
public AbstractProcessSpawnerActor(Gson gson,
|
||||||
ServiceConfiguration configuration,
|
ServiceConfiguration configuration,
|
||||||
MqPersistence persistence,
|
MqPersistence persistence,
|
||||||
ProcessService processService,
|
ProcessSpawnerService processSpawnerService,
|
||||||
String inboxName,
|
String inboxName,
|
||||||
ProcessService.ProcessId processId) {
|
ProcessSpawnerService.ProcessId processId) {
|
||||||
super(gson);
|
super(gson);
|
||||||
this.node = configuration.node();
|
this.node = configuration.node();
|
||||||
this.persistence = persistence;
|
this.persistence = persistence;
|
||||||
this.processService = processService;
|
this.processSpawnerService = processSpawnerService;
|
||||||
this.inboxName = inboxName + ":" + node;
|
this.inboxName = inboxName + ":" + node;
|
||||||
this.processId = processId;
|
this.processId = processId;
|
||||||
}
|
}
|
||||||
@@ -149,7 +152,7 @@ public class AbstractProcessSpawnerActor extends RecordActorPrototype {
|
|||||||
// Run this call in a separate thread so that this thread can be interrupted waiting for it
|
// Run this call in a separate thread so that this thread can be interrupted waiting for it
|
||||||
executorService.submit(() -> {
|
executorService.submit(() -> {
|
||||||
try {
|
try {
|
||||||
processService.trigger(processId);
|
processSpawnerService.trigger(processId);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
logger.warn("Error in triggering process", e);
|
logger.warn("Error in triggering process", e);
|
||||||
error.set(true);
|
error.set(true);
|
||||||
|
@@ -4,9 +4,9 @@ import com.google.gson.Gson;
|
|||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||||
import nu.marginalia.process.ProcessService;
|
|
||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||||
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
@@ -17,13 +17,13 @@ public class ConverterMonitorActor extends AbstractProcessSpawnerActor {
|
|||||||
public ConverterMonitorActor(Gson gson,
|
public ConverterMonitorActor(Gson gson,
|
||||||
ServiceConfiguration configuration,
|
ServiceConfiguration configuration,
|
||||||
MqPersistence persistence,
|
MqPersistence persistence,
|
||||||
ProcessService processService) {
|
ProcessSpawnerService processSpawnerService) {
|
||||||
super(gson,
|
super(gson,
|
||||||
configuration,
|
configuration,
|
||||||
persistence,
|
persistence,
|
||||||
processService,
|
processSpawnerService,
|
||||||
ProcessInboxNames.CONVERTER_INBOX,
|
ProcessInboxNames.CONVERTER_INBOX,
|
||||||
ProcessService.ProcessId.CONVERTER);
|
ProcessSpawnerService.ProcessId.CONVERTER);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@@ -4,9 +4,9 @@ import com.google.gson.Gson;
|
|||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||||
import nu.marginalia.process.ProcessService;
|
|
||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||||
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
@@ -16,13 +16,13 @@ public class CrawlerMonitorActor extends AbstractProcessSpawnerActor {
|
|||||||
public CrawlerMonitorActor(Gson gson,
|
public CrawlerMonitorActor(Gson gson,
|
||||||
ServiceConfiguration configuration,
|
ServiceConfiguration configuration,
|
||||||
MqPersistence persistence,
|
MqPersistence persistence,
|
||||||
ProcessService processService) {
|
ProcessSpawnerService processSpawnerService) {
|
||||||
super(gson,
|
super(gson,
|
||||||
configuration,
|
configuration,
|
||||||
persistence,
|
persistence,
|
||||||
processService,
|
processSpawnerService,
|
||||||
ProcessInboxNames.CRAWLER_INBOX,
|
ProcessInboxNames.CRAWLER_INBOX,
|
||||||
ProcessService.ProcessId.CRAWLER);
|
ProcessSpawnerService.ProcessId.CRAWLER);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@@ -6,7 +6,7 @@ import com.google.inject.Singleton;
|
|||||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
@@ -16,13 +16,13 @@ public class ExportTaskMonitorActor extends AbstractProcessSpawnerActor {
|
|||||||
public ExportTaskMonitorActor(Gson gson,
|
public ExportTaskMonitorActor(Gson gson,
|
||||||
ServiceConfiguration configuration,
|
ServiceConfiguration configuration,
|
||||||
MqPersistence persistence,
|
MqPersistence persistence,
|
||||||
ProcessService processService) {
|
ProcessSpawnerService processSpawnerService) {
|
||||||
super(gson,
|
super(gson,
|
||||||
configuration,
|
configuration,
|
||||||
persistence,
|
persistence,
|
||||||
processService,
|
processSpawnerService,
|
||||||
ProcessInboxNames.EXPORT_TASK_INBOX,
|
ProcessInboxNames.EXPORT_TASK_INBOX,
|
||||||
ProcessService.ProcessId.EXPORT_TASKS);
|
ProcessSpawnerService.ProcessId.EXPORT_TASKS);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@@ -4,9 +4,9 @@ import com.google.gson.Gson;
|
|||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||||
import nu.marginalia.process.ProcessService;
|
|
||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||||
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
@@ -17,13 +17,13 @@ public class IndexConstructorMonitorActor extends AbstractProcessSpawnerActor {
|
|||||||
public IndexConstructorMonitorActor(Gson gson,
|
public IndexConstructorMonitorActor(Gson gson,
|
||||||
ServiceConfiguration configuration,
|
ServiceConfiguration configuration,
|
||||||
MqPersistence persistence,
|
MqPersistence persistence,
|
||||||
ProcessService processService) {
|
ProcessSpawnerService processSpawnerService) {
|
||||||
super(gson,
|
super(gson,
|
||||||
configuration,
|
configuration,
|
||||||
persistence,
|
persistence,
|
||||||
processService,
|
processSpawnerService,
|
||||||
ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX,
|
ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX,
|
||||||
ProcessService.ProcessId.INDEX_CONSTRUCTOR);
|
ProcessSpawnerService.ProcessId.INDEX_CONSTRUCTOR);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@@ -6,7 +6,7 @@ import com.google.inject.Singleton;
|
|||||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
@@ -16,13 +16,13 @@ public class LiveCrawlerMonitorActor extends AbstractProcessSpawnerActor {
|
|||||||
public LiveCrawlerMonitorActor(Gson gson,
|
public LiveCrawlerMonitorActor(Gson gson,
|
||||||
ServiceConfiguration configuration,
|
ServiceConfiguration configuration,
|
||||||
MqPersistence persistence,
|
MqPersistence persistence,
|
||||||
ProcessService processService) {
|
ProcessSpawnerService processSpawnerService) {
|
||||||
super(gson,
|
super(gson,
|
||||||
configuration,
|
configuration,
|
||||||
persistence,
|
persistence,
|
||||||
processService,
|
processSpawnerService,
|
||||||
ProcessInboxNames.LIVE_CRAWLER_INBOX,
|
ProcessInboxNames.LIVE_CRAWLER_INBOX,
|
||||||
ProcessService.ProcessId.LIVE_CRAWLER);
|
ProcessSpawnerService.ProcessId.LIVE_CRAWLER);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@@ -4,9 +4,9 @@ import com.google.gson.Gson;
|
|||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||||
import nu.marginalia.process.ProcessService;
|
|
||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||||
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
@@ -17,13 +17,13 @@ public class LoaderMonitorActor extends AbstractProcessSpawnerActor {
|
|||||||
public LoaderMonitorActor(Gson gson,
|
public LoaderMonitorActor(Gson gson,
|
||||||
ServiceConfiguration configuration,
|
ServiceConfiguration configuration,
|
||||||
MqPersistence persistence,
|
MqPersistence persistence,
|
||||||
ProcessService processService) {
|
ProcessSpawnerService processSpawnerService) {
|
||||||
|
|
||||||
super(gson,
|
super(gson,
|
||||||
configuration,
|
configuration,
|
||||||
persistence, processService,
|
persistence, processSpawnerService,
|
||||||
ProcessInboxNames.LOADER_INBOX,
|
ProcessInboxNames.LOADER_INBOX,
|
||||||
ProcessService.ProcessId.LOADER);
|
ProcessSpawnerService.ProcessId.LOADER);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@@ -0,0 +1,29 @@
|
|||||||
|
package nu.marginalia.actor.proc;
|
||||||
|
|
||||||
|
import com.google.gson.Gson;
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||||
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
|
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||||
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class NdpMonitorActor extends AbstractProcessSpawnerActor {
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public NdpMonitorActor(Gson gson,
|
||||||
|
ServiceConfiguration configuration,
|
||||||
|
MqPersistence persistence,
|
||||||
|
ProcessSpawnerService processSpawnerService) {
|
||||||
|
super(gson,
|
||||||
|
configuration,
|
||||||
|
persistence,
|
||||||
|
processSpawnerService,
|
||||||
|
ProcessInboxNames.NDP_INBOX,
|
||||||
|
ProcessSpawnerService.ProcessId.NDP);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
@@ -0,0 +1,181 @@
|
|||||||
|
package nu.marginalia.actor.proc;
|
||||||
|
|
||||||
|
import com.google.gson.Gson;
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||||
|
import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||||
|
import nu.marginalia.actor.state.ActorStep;
|
||||||
|
import nu.marginalia.actor.state.Resume;
|
||||||
|
import nu.marginalia.actor.state.Terminal;
|
||||||
|
import nu.marginalia.mq.MqMessageState;
|
||||||
|
import nu.marginalia.mq.persistence.MqMessageHandlerRegistry;
|
||||||
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
|
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||||
|
import nu.marginalia.mqapi.ping.PingRequest;
|
||||||
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.util.concurrent.ExecutionException;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
|
|
||||||
|
|
||||||
|
// Unlike other monitor actors, the ping monitor will not merely wait for a request
|
||||||
|
// to be sent, but send one itself, hence we can't extend AbstractProcessSpawnerActor
|
||||||
|
// but have to reimplement a lot of the same logic ourselves.
|
||||||
|
@Singleton
|
||||||
|
public class PingMonitorActor extends RecordActorPrototype {
|
||||||
|
|
||||||
|
private final MqPersistence persistence;
|
||||||
|
private final ProcessSpawnerService processSpawnerService;
|
||||||
|
|
||||||
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
|
public static final int MAX_ATTEMPTS = 3;
|
||||||
|
private final String inboxName;
|
||||||
|
private final ProcessSpawnerService.ProcessId processId;
|
||||||
|
private final ExecutorService executorService = Executors.newSingleThreadExecutor();
|
||||||
|
private final int node;
|
||||||
|
private final Gson gson;
|
||||||
|
|
||||||
|
public record Initial() implements ActorStep {}
|
||||||
|
@Resume(behavior = ActorResumeBehavior.RETRY)
|
||||||
|
public record Monitor(int errorAttempts) implements ActorStep {}
|
||||||
|
@Resume(behavior = ActorResumeBehavior.RESTART)
|
||||||
|
public record Run(int attempts) implements ActorStep {}
|
||||||
|
@Terminal
|
||||||
|
public record Aborted() implements ActorStep {}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ActorStep transition(ActorStep self) throws Exception {
|
||||||
|
return switch (self) {
|
||||||
|
case Initial i -> {
|
||||||
|
PingRequest request = new PingRequest();
|
||||||
|
persistence.sendNewMessage(inboxName, null, null,
|
||||||
|
"PingRequest",
|
||||||
|
gson.toJson(request),
|
||||||
|
null);
|
||||||
|
|
||||||
|
yield new Monitor(0);
|
||||||
|
}
|
||||||
|
case Monitor(int errorAttempts) -> {
|
||||||
|
for (;;) {
|
||||||
|
var messages = persistence.eavesdrop(inboxName, 1);
|
||||||
|
|
||||||
|
if (messages.isEmpty() && !processSpawnerService.isRunning(processId)) {
|
||||||
|
synchronized (processId) {
|
||||||
|
processId.wait(5000);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (errorAttempts > 0) { // Reset the error counter if there is silence in the inbox
|
||||||
|
yield new Monitor(0);
|
||||||
|
}
|
||||||
|
// else continue
|
||||||
|
} else {
|
||||||
|
// Special: Associate this thread with the message so that we can get tracking
|
||||||
|
MqMessageHandlerRegistry.register(messages.getFirst().msgId());
|
||||||
|
|
||||||
|
yield new Run(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case Run(int attempts) -> {
|
||||||
|
try {
|
||||||
|
long startTime = System.currentTimeMillis();
|
||||||
|
var exec = new TaskExecution();
|
||||||
|
long endTime = System.currentTimeMillis();
|
||||||
|
|
||||||
|
if (exec.isError()) {
|
||||||
|
if (attempts < MAX_ATTEMPTS)
|
||||||
|
yield new Run(attempts + 1);
|
||||||
|
else
|
||||||
|
yield new Error();
|
||||||
|
}
|
||||||
|
else if (endTime - startTime < TimeUnit.SECONDS.toMillis(1)) {
|
||||||
|
// To avoid boot loops, we transition to error if the process
|
||||||
|
// didn't run for longer than 1 seconds. This might happen if
|
||||||
|
// the process crashes before it can reach the heartbeat and inbox
|
||||||
|
// stages of execution. In this case it would not report having acted
|
||||||
|
// on its message, and the process would be restarted forever without
|
||||||
|
// the attempts counter incrementing.
|
||||||
|
yield new Error("Process terminated within 1 seconds of starting");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (InterruptedException ex) {
|
||||||
|
// We get this exception when the process is cancelled by the user
|
||||||
|
|
||||||
|
processSpawnerService.kill(processId);
|
||||||
|
setCurrentMessageToDead();
|
||||||
|
|
||||||
|
yield new Aborted();
|
||||||
|
}
|
||||||
|
|
||||||
|
yield new Monitor(attempts);
|
||||||
|
}
|
||||||
|
default -> new Error();
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
public String describe() {
|
||||||
|
return "Spawns a(n) " + processId + " process and monitors its inbox for messages";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public PingMonitorActor(Gson gson,
|
||||||
|
ServiceConfiguration configuration,
|
||||||
|
MqPersistence persistence,
|
||||||
|
ProcessSpawnerService processSpawnerService) throws SQLException {
|
||||||
|
super(gson);
|
||||||
|
this.gson = gson;
|
||||||
|
this.node = configuration.node();
|
||||||
|
this.persistence = persistence;
|
||||||
|
this.processSpawnerService = processSpawnerService;
|
||||||
|
this.inboxName = ProcessInboxNames.PING_INBOX + ":" + node;
|
||||||
|
this.processId = ProcessSpawnerService.ProcessId.PING;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Sets the message to dead in the database to avoid
|
||||||
|
* the service respawning on the same task when we
|
||||||
|
* re-enable this actor */
|
||||||
|
private void setCurrentMessageToDead() {
|
||||||
|
try {
|
||||||
|
var messages = persistence.eavesdrop(inboxName, 1);
|
||||||
|
|
||||||
|
if (messages.isEmpty()) // Possibly a race condition where the task is already finished
|
||||||
|
return;
|
||||||
|
|
||||||
|
var theMessage = messages.iterator().next();
|
||||||
|
persistence.updateMessageState(theMessage.msgId(), MqMessageState.DEAD);
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
logger.error("Tried but failed to set the message for " + processId + " to dead", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Encapsulates the execution of the process in a separate thread so that
|
||||||
|
* we can interrupt the thread if the process is cancelled */
|
||||||
|
private class TaskExecution {
|
||||||
|
private final AtomicBoolean error = new AtomicBoolean(false);
|
||||||
|
public TaskExecution() throws ExecutionException, InterruptedException {
|
||||||
|
// Run this call in a separate thread so that this thread can be interrupted waiting for it
|
||||||
|
executorService.submit(() -> {
|
||||||
|
try {
|
||||||
|
processSpawnerService.trigger(processId);
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.warn("Error in triggering process", e);
|
||||||
|
error.set(true);
|
||||||
|
}
|
||||||
|
}).get(); // Wait for the process to start
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isError() {
|
||||||
|
return error.get();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@@ -8,7 +8,7 @@ import nu.marginalia.actor.prototype.RecordActorPrototype;
|
|||||||
import nu.marginalia.actor.state.ActorResumeBehavior;
|
import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||||
import nu.marginalia.actor.state.ActorStep;
|
import nu.marginalia.actor.state.ActorStep;
|
||||||
import nu.marginalia.actor.state.Resume;
|
import nu.marginalia.actor.state.Resume;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.service.control.ServiceEventLog;
|
import nu.marginalia.service.control.ServiceEventLog;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
|
||||||
@@ -21,7 +21,7 @@ import java.util.concurrent.TimeUnit;
|
|||||||
public class ProcessLivenessMonitorActor extends RecordActorPrototype {
|
public class ProcessLivenessMonitorActor extends RecordActorPrototype {
|
||||||
|
|
||||||
private final ServiceEventLog eventLogService;
|
private final ServiceEventLog eventLogService;
|
||||||
private final ProcessService processService;
|
private final ProcessSpawnerService processSpawnerService;
|
||||||
private final HikariDataSource dataSource;
|
private final HikariDataSource dataSource;
|
||||||
|
|
||||||
private final int node;
|
private final int node;
|
||||||
@@ -49,7 +49,7 @@ public class ProcessLivenessMonitorActor extends RecordActorPrototype {
|
|||||||
var processId = heartbeat.getProcessId();
|
var processId = heartbeat.getProcessId();
|
||||||
if (null == processId) continue;
|
if (null == processId) continue;
|
||||||
|
|
||||||
if (processService.isRunning(processId) && heartbeat.lastSeenMillis() < 10_000)
|
if (processSpawnerService.isRunning(processId) && heartbeat.lastSeenMillis() < 10_000)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
flagProcessAsStopped(heartbeat);
|
flagProcessAsStopped(heartbeat);
|
||||||
@@ -72,12 +72,12 @@ public class ProcessLivenessMonitorActor extends RecordActorPrototype {
|
|||||||
public ProcessLivenessMonitorActor(Gson gson,
|
public ProcessLivenessMonitorActor(Gson gson,
|
||||||
ServiceEventLog eventLogService,
|
ServiceEventLog eventLogService,
|
||||||
ServiceConfiguration configuration,
|
ServiceConfiguration configuration,
|
||||||
ProcessService processService,
|
ProcessSpawnerService processSpawnerService,
|
||||||
HikariDataSource dataSource) {
|
HikariDataSource dataSource) {
|
||||||
super(gson);
|
super(gson);
|
||||||
this.node = configuration.node();
|
this.node = configuration.node();
|
||||||
this.eventLogService = eventLogService;
|
this.eventLogService = eventLogService;
|
||||||
this.processService = processService;
|
this.processSpawnerService = processSpawnerService;
|
||||||
this.dataSource = dataSource;
|
this.dataSource = dataSource;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -208,8 +208,8 @@ public class ProcessLivenessMonitorActor extends RecordActorPrototype {
|
|||||||
public boolean isRunning() {
|
public boolean isRunning() {
|
||||||
return "RUNNING".equals(status);
|
return "RUNNING".equals(status);
|
||||||
}
|
}
|
||||||
public ProcessService.ProcessId getProcessId() {
|
public ProcessSpawnerService.ProcessId getProcessId() {
|
||||||
return ProcessService.translateExternalIdBase(processBase);
|
return ProcessSpawnerService.translateExternalIdBase(processBase);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -47,6 +47,8 @@ public class ScrapeFeedsActor extends RecordActorPrototype {
|
|||||||
|
|
||||||
private final Path feedPath = WmsaHome.getHomePath().resolve("data/scrape-urls.txt");
|
private final Path feedPath = WmsaHome.getHomePath().resolve("data/scrape-urls.txt");
|
||||||
|
|
||||||
|
private static boolean insertFoundDomains = Boolean.getBoolean("loader.insertFoundDomains");
|
||||||
|
|
||||||
public record Initial() implements ActorStep {}
|
public record Initial() implements ActorStep {}
|
||||||
@Resume(behavior = ActorResumeBehavior.RETRY)
|
@Resume(behavior = ActorResumeBehavior.RETRY)
|
||||||
public record Wait(String ts) implements ActorStep {}
|
public record Wait(String ts) implements ActorStep {}
|
||||||
@@ -57,6 +59,8 @@ public class ScrapeFeedsActor extends RecordActorPrototype {
|
|||||||
public ActorStep transition(ActorStep self) throws Exception {
|
public ActorStep transition(ActorStep self) throws Exception {
|
||||||
return switch(self) {
|
return switch(self) {
|
||||||
case Initial() -> {
|
case Initial() -> {
|
||||||
|
if (!insertFoundDomains) yield new Error("Domain insertion prohibited, aborting");
|
||||||
|
|
||||||
if (nodeConfigurationService.get(nodeId).profile() != NodeProfile.REALTIME) {
|
if (nodeConfigurationService.get(nodeId).profile() != NodeProfile.REALTIME) {
|
||||||
yield new Error("Invalid node profile for RSS update");
|
yield new Error("Invalid node profile for RSS update");
|
||||||
}
|
}
|
||||||
|
@@ -3,11 +3,11 @@ package nu.marginalia.actor.task;
|
|||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.actor.state.ActorControlFlowException;
|
import nu.marginalia.actor.state.ActorControlFlowException;
|
||||||
import nu.marginalia.mq.MqMessageState;
|
|
||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
|
||||||
import nu.marginalia.process.ProcessService;
|
|
||||||
import nu.marginalia.mq.MqMessage;
|
import nu.marginalia.mq.MqMessage;
|
||||||
|
import nu.marginalia.mq.MqMessageState;
|
||||||
import nu.marginalia.mq.outbox.MqOutbox;
|
import nu.marginalia.mq.outbox.MqOutbox;
|
||||||
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@@ -20,13 +20,13 @@ public class ActorProcessWatcher {
|
|||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(ActorProcessWatcher.class);
|
private static final Logger logger = LoggerFactory.getLogger(ActorProcessWatcher.class);
|
||||||
private final MqPersistence persistence;
|
private final MqPersistence persistence;
|
||||||
private final ProcessService processService;
|
private final ProcessSpawnerService processSpawnerService;
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public ActorProcessWatcher(MqPersistence persistence,
|
public ActorProcessWatcher(MqPersistence persistence,
|
||||||
ProcessService processService) {
|
ProcessSpawnerService processSpawnerService) {
|
||||||
this.persistence = persistence;
|
this.persistence = persistence;
|
||||||
this.processService = processService;
|
this.processSpawnerService = processSpawnerService;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Wait for a process to start, and then wait for a response from the process,
|
/** Wait for a process to start, and then wait for a response from the process,
|
||||||
@@ -36,7 +36,7 @@ public class ActorProcessWatcher {
|
|||||||
* <p>
|
* <p>
|
||||||
* When interrupted, the process is killed and the message is marked as dead.
|
* When interrupted, the process is killed and the message is marked as dead.
|
||||||
*/
|
*/
|
||||||
public MqMessage waitResponse(MqOutbox outbox, ProcessService.ProcessId processId, long msgId)
|
public MqMessage waitResponse(MqOutbox outbox, ProcessSpawnerService.ProcessId processId, long msgId)
|
||||||
throws ActorControlFlowException, InterruptedException, SQLException
|
throws ActorControlFlowException, InterruptedException, SQLException
|
||||||
{
|
{
|
||||||
// enums values only have a single instance,
|
// enums values only have a single instance,
|
||||||
@@ -65,7 +65,7 @@ public class ActorProcessWatcher {
|
|||||||
// This will prevent the monitor process from attempting to respawn the process as we kill it
|
// This will prevent the monitor process from attempting to respawn the process as we kill it
|
||||||
|
|
||||||
outbox.flagAsDead(msgId);
|
outbox.flagAsDead(msgId);
|
||||||
processService.kill(processId);
|
processSpawnerService.kill(processId);
|
||||||
|
|
||||||
logger.info("Process {} killed due to interrupt", processId);
|
logger.info("Process {} killed due to interrupt", processId);
|
||||||
}
|
}
|
||||||
@@ -94,12 +94,12 @@ public class ActorProcessWatcher {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** Wait the specified time for the specified process to start running (does not start the process) */
|
/** Wait the specified time for the specified process to start running (does not start the process) */
|
||||||
private boolean waitForProcess(ProcessService.ProcessId processId, TimeUnit unit, int duration) throws InterruptedException {
|
private boolean waitForProcess(ProcessSpawnerService.ProcessId processId, TimeUnit unit, int duration) throws InterruptedException {
|
||||||
|
|
||||||
// Wait for process to start
|
// Wait for process to start
|
||||||
long deadline = System.currentTimeMillis() + unit.toMillis(duration);
|
long deadline = System.currentTimeMillis() + unit.toMillis(duration);
|
||||||
while (System.currentTimeMillis() < deadline) {
|
while (System.currentTimeMillis() < deadline) {
|
||||||
if (processService.isRunning(processId))
|
if (processSpawnerService.isRunning(processId))
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
TimeUnit.MILLISECONDS.sleep(100);
|
TimeUnit.MILLISECONDS.sleep(100);
|
||||||
|
@@ -12,7 +12,7 @@ import nu.marginalia.mq.MqMessageState;
|
|||||||
import nu.marginalia.mq.outbox.MqOutbox;
|
import nu.marginalia.mq.outbox.MqOutbox;
|
||||||
import nu.marginalia.mqapi.converting.ConvertRequest;
|
import nu.marginalia.mqapi.converting.ConvertRequest;
|
||||||
import nu.marginalia.process.ProcessOutboxes;
|
import nu.marginalia.process.ProcessOutboxes;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.sideload.RedditSideloadHelper;
|
import nu.marginalia.sideload.RedditSideloadHelper;
|
||||||
import nu.marginalia.sideload.SideloadHelper;
|
import nu.marginalia.sideload.SideloadHelper;
|
||||||
import nu.marginalia.sideload.StackExchangeSideloadHelper;
|
import nu.marginalia.sideload.StackExchangeSideloadHelper;
|
||||||
@@ -218,7 +218,7 @@ public class ConvertActor extends RecordActorPrototype {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
case ConvertWait(FileStorageId destFid, long msgId) -> {
|
case ConvertWait(FileStorageId destFid, long msgId) -> {
|
||||||
var rsp = processWatcher.waitResponse(mqConverterOutbox, ProcessService.ProcessId.CONVERTER, msgId);
|
var rsp = processWatcher.waitResponse(mqConverterOutbox, ProcessSpawnerService.ProcessId.CONVERTER, msgId);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK) {
|
if (rsp.state() != MqMessageState.OK) {
|
||||||
yield new Error("Converter failed");
|
yield new Error("Converter failed");
|
||||||
|
@@ -18,7 +18,7 @@ import nu.marginalia.mqapi.index.IndexName;
|
|||||||
import nu.marginalia.mqapi.loading.LoadRequest;
|
import nu.marginalia.mqapi.loading.LoadRequest;
|
||||||
import nu.marginalia.nodecfg.NodeConfigurationService;
|
import nu.marginalia.nodecfg.NodeConfigurationService;
|
||||||
import nu.marginalia.process.ProcessOutboxes;
|
import nu.marginalia.process.ProcessOutboxes;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.storage.model.FileStorageId;
|
import nu.marginalia.storage.model.FileStorageId;
|
||||||
@@ -95,7 +95,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
|||||||
case Convert(FileStorageId crawlId, FileStorageId processedId, long msgId) when msgId < 0 ->
|
case Convert(FileStorageId crawlId, FileStorageId processedId, long msgId) when msgId < 0 ->
|
||||||
new Convert(crawlId, processedId, mqConverterOutbox.sendAsync(ConvertRequest.forCrawlData(crawlId, processedId)));
|
new Convert(crawlId, processedId, mqConverterOutbox.sendAsync(ConvertRequest.forCrawlData(crawlId, processedId)));
|
||||||
case Convert(FileStorageId crawlId, FileStorageId processedId, long msgId) -> {
|
case Convert(FileStorageId crawlId, FileStorageId processedId, long msgId) -> {
|
||||||
var rsp = processWatcher.waitResponse(mqConverterOutbox, ProcessService.ProcessId.CONVERTER, msgId);
|
var rsp = processWatcher.waitResponse(mqConverterOutbox, ProcessSpawnerService.ProcessId.CONVERTER, msgId);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK)
|
if (rsp.state() != MqMessageState.OK)
|
||||||
yield new Error("Converter failed");
|
yield new Error("Converter failed");
|
||||||
@@ -129,7 +129,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
|||||||
yield new Load(processedIds, id);
|
yield new Load(processedIds, id);
|
||||||
}
|
}
|
||||||
case Load(List<FileStorageId> processedIds, long msgId) -> {
|
case Load(List<FileStorageId> processedIds, long msgId) -> {
|
||||||
var rsp = processWatcher.waitResponse(mqLoaderOutbox, ProcessService.ProcessId.LOADER, msgId);
|
var rsp = processWatcher.waitResponse(mqLoaderOutbox, ProcessSpawnerService.ProcessId.LOADER, msgId);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK) {
|
if (rsp.state() != MqMessageState.OK) {
|
||||||
yield new Error("Loader failed");
|
yield new Error("Loader failed");
|
||||||
@@ -165,7 +165,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
|||||||
}
|
}
|
||||||
case ReindexFwd(long id) when id < 0 -> new ReindexFwd(createIndex(IndexName.FORWARD));
|
case ReindexFwd(long id) when id < 0 -> new ReindexFwd(createIndex(IndexName.FORWARD));
|
||||||
case ReindexFwd(long id) -> {
|
case ReindexFwd(long id) -> {
|
||||||
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessService.ProcessId.INDEX_CONSTRUCTOR, id);
|
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessSpawnerService.ProcessId.INDEX_CONSTRUCTOR, id);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK)
|
if (rsp.state() != MqMessageState.OK)
|
||||||
yield new Error("Forward index construction failed");
|
yield new Error("Forward index construction failed");
|
||||||
@@ -174,7 +174,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
|||||||
}
|
}
|
||||||
case ReindexFull(long id) when id < 0 -> new ReindexFull(createIndex(IndexName.REVERSE_FULL));
|
case ReindexFull(long id) when id < 0 -> new ReindexFull(createIndex(IndexName.REVERSE_FULL));
|
||||||
case ReindexFull(long id) -> {
|
case ReindexFull(long id) -> {
|
||||||
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessService.ProcessId.INDEX_CONSTRUCTOR, id);
|
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessSpawnerService.ProcessId.INDEX_CONSTRUCTOR, id);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK)
|
if (rsp.state() != MqMessageState.OK)
|
||||||
yield new Error("Full index construction failed");
|
yield new Error("Full index construction failed");
|
||||||
@@ -183,7 +183,7 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
|||||||
}
|
}
|
||||||
case ReindexPrio(long id) when id < 0 -> new ReindexPrio(createIndex(IndexName.REVERSE_PRIO));
|
case ReindexPrio(long id) when id < 0 -> new ReindexPrio(createIndex(IndexName.REVERSE_PRIO));
|
||||||
case ReindexPrio(long id) -> {
|
case ReindexPrio(long id) -> {
|
||||||
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessService.ProcessId.INDEX_CONSTRUCTOR, id);
|
var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessSpawnerService.ProcessId.INDEX_CONSTRUCTOR, id);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK)
|
if (rsp.state() != MqMessageState.OK)
|
||||||
yield new Error("Prio index construction failed");
|
yield new Error("Prio index construction failed");
|
||||||
|
@@ -13,7 +13,7 @@ import nu.marginalia.mq.MqMessageState;
|
|||||||
import nu.marginalia.mq.outbox.MqOutbox;
|
import nu.marginalia.mq.outbox.MqOutbox;
|
||||||
import nu.marginalia.mqapi.crawling.CrawlRequest;
|
import nu.marginalia.mqapi.crawling.CrawlRequest;
|
||||||
import nu.marginalia.process.ProcessOutboxes;
|
import nu.marginalia.process.ProcessOutboxes;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.storage.model.FileStorageId;
|
import nu.marginalia.storage.model.FileStorageId;
|
||||||
import nu.marginalia.storage.model.FileStorageType;
|
import nu.marginalia.storage.model.FileStorageType;
|
||||||
@@ -76,7 +76,7 @@ public class CrawlActor extends RecordActorPrototype {
|
|||||||
case Crawl (long msgId, FileStorageId fid, boolean cascadeLoad) -> {
|
case Crawl (long msgId, FileStorageId fid, boolean cascadeLoad) -> {
|
||||||
var rsp = processWatcher.waitResponse(
|
var rsp = processWatcher.waitResponse(
|
||||||
mqCrawlerOutbox,
|
mqCrawlerOutbox,
|
||||||
ProcessService.ProcessId.CRAWLER,
|
ProcessSpawnerService.ProcessId.CRAWLER,
|
||||||
msgId);
|
msgId);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK) {
|
if (rsp.state() != MqMessageState.OK) {
|
||||||
|
@@ -10,7 +10,7 @@ import nu.marginalia.mq.outbox.MqOutbox;
|
|||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||||
import nu.marginalia.process.ProcessOutboxes;
|
import nu.marginalia.process.ProcessOutboxes;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.storage.model.FileStorageId;
|
import nu.marginalia.storage.model.FileStorageId;
|
||||||
import nu.marginalia.storage.model.FileStorageState;
|
import nu.marginalia.storage.model.FileStorageState;
|
||||||
@@ -55,7 +55,7 @@ public class ExportAtagsActor extends RecordActorPrototype {
|
|||||||
yield new Run(responseMsgId, crawlId, destId, newMsgId);
|
yield new Run(responseMsgId, crawlId, destId, newMsgId);
|
||||||
}
|
}
|
||||||
case Run(long responseMsgId, FileStorageId crawlId, FileStorageId destId, long msgId) -> {
|
case Run(long responseMsgId, FileStorageId crawlId, FileStorageId destId, long msgId) -> {
|
||||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessSpawnerService.ProcessId.EXPORT_TASKS, msgId);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK) {
|
if (rsp.state() != MqMessageState.OK) {
|
||||||
storageService.flagFileForDeletion(destId);
|
storageService.flagFileForDeletion(destId);
|
||||||
|
@@ -10,7 +10,7 @@ import nu.marginalia.mq.outbox.MqOutbox;
|
|||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||||
import nu.marginalia.process.ProcessOutboxes;
|
import nu.marginalia.process.ProcessOutboxes;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.storage.model.FileStorageId;
|
import nu.marginalia.storage.model.FileStorageId;
|
||||||
import nu.marginalia.storage.model.FileStorageState;
|
import nu.marginalia.storage.model.FileStorageState;
|
||||||
@@ -54,7 +54,7 @@ public class ExportFeedsActor extends RecordActorPrototype {
|
|||||||
yield new Run(responseMsgId, crawlId, destId, newMsgId);
|
yield new Run(responseMsgId, crawlId, destId, newMsgId);
|
||||||
}
|
}
|
||||||
case Run(long responseMsgId, _, FileStorageId destId, long msgId) -> {
|
case Run(long responseMsgId, _, FileStorageId destId, long msgId) -> {
|
||||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessSpawnerService.ProcessId.EXPORT_TASKS, msgId);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK) {
|
if (rsp.state() != MqMessageState.OK) {
|
||||||
storageService.flagFileForDeletion(destId);
|
storageService.flagFileForDeletion(destId);
|
||||||
|
@@ -9,7 +9,7 @@ import nu.marginalia.mq.MqMessageState;
|
|||||||
import nu.marginalia.mq.outbox.MqOutbox;
|
import nu.marginalia.mq.outbox.MqOutbox;
|
||||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||||
import nu.marginalia.process.ProcessOutboxes;
|
import nu.marginalia.process.ProcessOutboxes;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.storage.model.FileStorageId;
|
import nu.marginalia.storage.model.FileStorageId;
|
||||||
import nu.marginalia.storage.model.FileStorageState;
|
import nu.marginalia.storage.model.FileStorageState;
|
||||||
@@ -52,7 +52,7 @@ public class ExportSampleDataActor extends RecordActorPrototype {
|
|||||||
yield new Run(crawlId, destId, size, ctFilter, name, newMsgId);
|
yield new Run(crawlId, destId, size, ctFilter, name, newMsgId);
|
||||||
}
|
}
|
||||||
case Run(_, FileStorageId destId, _, _, _, long msgId) -> {
|
case Run(_, FileStorageId destId, _, _, _, long msgId) -> {
|
||||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessSpawnerService.ProcessId.EXPORT_TASKS, msgId);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK) {
|
if (rsp.state() != MqMessageState.OK) {
|
||||||
storageService.flagFileForDeletion(destId);
|
storageService.flagFileForDeletion(destId);
|
||||||
|
@@ -10,7 +10,7 @@ import nu.marginalia.mq.outbox.MqOutbox;
|
|||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||||
import nu.marginalia.process.ProcessOutboxes;
|
import nu.marginalia.process.ProcessOutboxes;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.storage.model.FileStorageId;
|
import nu.marginalia.storage.model.FileStorageId;
|
||||||
import nu.marginalia.storage.model.FileStorageState;
|
import nu.marginalia.storage.model.FileStorageState;
|
||||||
@@ -52,7 +52,7 @@ public class ExportTermFreqActor extends RecordActorPrototype {
|
|||||||
yield new Run(responseMsgId, crawlId, destId, newMsgId);
|
yield new Run(responseMsgId, crawlId, destId, newMsgId);
|
||||||
}
|
}
|
||||||
case Run(long responseMsgId, _, FileStorageId destId, long msgId) -> {
|
case Run(long responseMsgId, _, FileStorageId destId, long msgId) -> {
|
||||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessSpawnerService.ProcessId.EXPORT_TASKS, msgId);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK) {
|
if (rsp.state() != MqMessageState.OK) {
|
||||||
storageService.flagFileForDeletion(destId);
|
storageService.flagFileForDeletion(destId);
|
||||||
|
@@ -13,7 +13,7 @@ import nu.marginalia.mq.MqMessageState;
|
|||||||
import nu.marginalia.mq.outbox.MqOutbox;
|
import nu.marginalia.mq.outbox.MqOutbox;
|
||||||
import nu.marginalia.mqapi.crawling.LiveCrawlRequest;
|
import nu.marginalia.mqapi.crawling.LiveCrawlRequest;
|
||||||
import nu.marginalia.process.ProcessOutboxes;
|
import nu.marginalia.process.ProcessOutboxes;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
@@ -44,7 +44,6 @@ public class LiveCrawlActor extends RecordActorPrototype {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ActorStep transition(ActorStep self) throws Exception {
|
public ActorStep transition(ActorStep self) throws Exception {
|
||||||
logger.info("{}", self);
|
|
||||||
return switch (self) {
|
return switch (self) {
|
||||||
case Initial() -> {
|
case Initial() -> {
|
||||||
yield new Monitor("-");
|
yield new Monitor("-");
|
||||||
@@ -75,7 +74,7 @@ public class LiveCrawlActor extends RecordActorPrototype {
|
|||||||
yield new LiveCrawl(feedsHash, id);
|
yield new LiveCrawl(feedsHash, id);
|
||||||
}
|
}
|
||||||
case LiveCrawl(String feedsHash, long msgId) -> {
|
case LiveCrawl(String feedsHash, long msgId) -> {
|
||||||
var rsp = processWatcher.waitResponse(mqLiveCrawlerOutbox, ProcessService.ProcessId.LIVE_CRAWLER, msgId);
|
var rsp = processWatcher.waitResponse(mqLiveCrawlerOutbox, ProcessSpawnerService.ProcessId.LIVE_CRAWLER, msgId);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK) {
|
if (rsp.state() != MqMessageState.OK) {
|
||||||
yield new Error("Crawler failed");
|
yield new Error("Crawler failed");
|
||||||
|
@@ -11,7 +11,7 @@ import nu.marginalia.mq.MqMessageState;
|
|||||||
import nu.marginalia.mq.outbox.MqOutbox;
|
import nu.marginalia.mq.outbox.MqOutbox;
|
||||||
import nu.marginalia.mqapi.crawling.CrawlRequest;
|
import nu.marginalia.mqapi.crawling.CrawlRequest;
|
||||||
import nu.marginalia.process.ProcessOutboxes;
|
import nu.marginalia.process.ProcessOutboxes;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.storage.model.FileStorageId;
|
import nu.marginalia.storage.model.FileStorageId;
|
||||||
import nu.marginalia.storage.model.FileStorageType;
|
import nu.marginalia.storage.model.FileStorageType;
|
||||||
@@ -51,7 +51,7 @@ public class RecrawlSingleDomainActor extends RecordActorPrototype {
|
|||||||
case Crawl (long msgId) -> {
|
case Crawl (long msgId) -> {
|
||||||
var rsp = processWatcher.waitResponse(
|
var rsp = processWatcher.waitResponse(
|
||||||
mqCrawlerOutbox,
|
mqCrawlerOutbox,
|
||||||
ProcessService.ProcessId.CRAWLER,
|
ProcessSpawnerService.ProcessId.CRAWLER,
|
||||||
msgId);
|
msgId);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK) {
|
if (rsp.state() != MqMessageState.OK) {
|
||||||
|
@@ -9,7 +9,7 @@ import nu.marginalia.mq.MqMessageState;
|
|||||||
import nu.marginalia.mq.outbox.MqOutbox;
|
import nu.marginalia.mq.outbox.MqOutbox;
|
||||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||||
import nu.marginalia.process.ProcessOutboxes;
|
import nu.marginalia.process.ProcessOutboxes;
|
||||||
import nu.marginalia.process.ProcessService;
|
import nu.marginalia.process.ProcessSpawnerService;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@@ -34,7 +34,7 @@ public class TriggerAdjacencyCalculationActor extends RecordActorPrototype {
|
|||||||
yield new Run(newMsgId);
|
yield new Run(newMsgId);
|
||||||
}
|
}
|
||||||
case Run(long msgId) -> {
|
case Run(long msgId) -> {
|
||||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessSpawnerService.ProcessId.EXPORT_TASKS, msgId);
|
||||||
|
|
||||||
if (rsp.state() != MqMessageState.OK) {
|
if (rsp.state() != MqMessageState.OK) {
|
||||||
yield new Error("Exporter failed");
|
yield new Error("Exporter failed");
|
||||||
|
@@ -0,0 +1,60 @@
|
|||||||
|
package nu.marginalia.actor.task;
|
||||||
|
|
||||||
|
import com.google.gson.Gson;
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||||
|
import nu.marginalia.actor.state.ActorStep;
|
||||||
|
import nu.marginalia.mq.MqMessageState;
|
||||||
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
|
import nu.marginalia.nsfw.NsfwDomainFilter;
|
||||||
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class UpdateNsfwFiltersActor extends RecordActorPrototype {
|
||||||
|
private final ServiceConfiguration serviceConfiguration;
|
||||||
|
private final NsfwDomainFilter nsfwDomainFilter;
|
||||||
|
private final MqPersistence persistence;
|
||||||
|
|
||||||
|
public record Initial(long respondMsgId) implements ActorStep {}
|
||||||
|
public record Run(long respondMsgId) implements ActorStep {}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ActorStep transition(ActorStep self) throws Exception {
|
||||||
|
return switch(self) {
|
||||||
|
case Initial(long respondMsgId) -> {
|
||||||
|
if (serviceConfiguration.node() != 1) {
|
||||||
|
persistence.updateMessageState(respondMsgId, MqMessageState.ERR);
|
||||||
|
yield new Error("This actor can only run on node 1");
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
yield new Run(respondMsgId);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case Run(long respondMsgId) -> {
|
||||||
|
nsfwDomainFilter.fetchLists();
|
||||||
|
persistence.updateMessageState(respondMsgId, MqMessageState.OK);
|
||||||
|
yield new End();
|
||||||
|
}
|
||||||
|
default -> new Error();
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String describe() {
|
||||||
|
return "Sync NSFW filters";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public UpdateNsfwFiltersActor(Gson gson,
|
||||||
|
ServiceConfiguration serviceConfiguration,
|
||||||
|
NsfwDomainFilter nsfwDomainFilter,
|
||||||
|
MqPersistence persistence)
|
||||||
|
{
|
||||||
|
super(gson);
|
||||||
|
this.serviceConfiguration = serviceConfiguration;
|
||||||
|
this.nsfwDomainFilter = nsfwDomainFilter;
|
||||||
|
this.persistence = persistence;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@@ -1,6 +1,7 @@
|
|||||||
package nu.marginalia.execution;
|
package nu.marginalia.execution;
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
|
import io.grpc.Status;
|
||||||
import io.grpc.stub.StreamObserver;
|
import io.grpc.stub.StreamObserver;
|
||||||
import nu.marginalia.actor.ExecutorActor;
|
import nu.marginalia.actor.ExecutorActor;
|
||||||
import nu.marginalia.actor.ExecutorActorControlService;
|
import nu.marginalia.actor.ExecutorActorControlService;
|
||||||
@@ -36,7 +37,7 @@ public class ExecutorCrawlGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -52,7 +53,7 @@ public class ExecutorCrawlGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -66,7 +67,7 @@ public class ExecutorCrawlGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -80,7 +81,7 @@ public class ExecutorCrawlGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -98,7 +99,7 @@ public class ExecutorCrawlGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.execution;
|
|||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
|
import io.grpc.Status;
|
||||||
import io.grpc.stub.StreamObserver;
|
import io.grpc.stub.StreamObserver;
|
||||||
import nu.marginalia.actor.ExecutorActor;
|
import nu.marginalia.actor.ExecutorActor;
|
||||||
import nu.marginalia.actor.ExecutorActorControlService;
|
import nu.marginalia.actor.ExecutorActorControlService;
|
||||||
@@ -38,7 +39,7 @@ public class ExecutorExportGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -57,7 +58,7 @@ public class ExecutorExportGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -73,7 +74,7 @@ public class ExecutorExportGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -87,7 +88,7 @@ public class ExecutorExportGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -99,7 +100,7 @@ public class ExecutorExportGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -114,14 +115,14 @@ public class ExecutorExportGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void exportAllAtags(Empty request, StreamObserver<Empty> responseObserver) {
|
public void exportAllAtags(Empty request, StreamObserver<Empty> responseObserver) {
|
||||||
if (serviceConfiguration.node() != 1) {
|
if (serviceConfiguration.node() != 1) {
|
||||||
responseObserver.onError(new IllegalArgumentException("Export all atags is only available on node 1"));
|
responseObserver.onError(Status.UNAVAILABLE.withDescription("Export all atags is only available on node 1").asRuntimeException());
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
actorControlService.startFrom(ExecutorActor.PREC_EXPORT_ALL,
|
actorControlService.startFrom(ExecutorActor.PREC_EXPORT_ALL,
|
||||||
@@ -131,7 +132,7 @@ public class ExecutorExportGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -145,7 +146,7 @@ public class ExecutorExportGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -159,7 +160,7 @@ public class ExecutorExportGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -1,6 +1,7 @@
|
|||||||
package nu.marginalia.execution;
|
package nu.marginalia.execution;
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
|
import io.grpc.Status;
|
||||||
import io.grpc.stub.StreamObserver;
|
import io.grpc.stub.StreamObserver;
|
||||||
import nu.marginalia.WmsaHome;
|
import nu.marginalia.WmsaHome;
|
||||||
import nu.marginalia.actor.ActorApi;
|
import nu.marginalia.actor.ActorApi;
|
||||||
@@ -10,6 +11,7 @@ import nu.marginalia.actor.state.ActorStateInstance;
|
|||||||
import nu.marginalia.actor.task.DownloadSampleActor;
|
import nu.marginalia.actor.task.DownloadSampleActor;
|
||||||
import nu.marginalia.actor.task.RestoreBackupActor;
|
import nu.marginalia.actor.task.RestoreBackupActor;
|
||||||
import nu.marginalia.actor.task.TriggerAdjacencyCalculationActor;
|
import nu.marginalia.actor.task.TriggerAdjacencyCalculationActor;
|
||||||
|
import nu.marginalia.actor.task.UpdateNsfwFiltersActor;
|
||||||
import nu.marginalia.functions.execution.api.*;
|
import nu.marginalia.functions.execution.api.*;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
import nu.marginalia.service.server.DiscoverableService;
|
import nu.marginalia.service.server.DiscoverableService;
|
||||||
@@ -57,7 +59,7 @@ public class ExecutorGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -69,7 +71,7 @@ public class ExecutorGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -81,7 +83,7 @@ public class ExecutorGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -95,7 +97,7 @@ public class ExecutorGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -111,7 +113,7 @@ public class ExecutorGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -127,7 +129,7 @@ public class ExecutorGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -202,7 +204,7 @@ public class ExecutorGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -228,7 +230,7 @@ public class ExecutorGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -263,4 +265,19 @@ public class ExecutorGrpcService
|
|||||||
System.exit(0);
|
System.exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void updateNsfwFilters(RpcUpdateNsfwFilters request, StreamObserver<Empty> responseObserver) {
|
||||||
|
logger.info("Got request {}", request);
|
||||||
|
try {
|
||||||
|
actorControlService.startFrom(ExecutorActor.UPDATE_NSFW_LISTS,
|
||||||
|
new UpdateNsfwFiltersActor.Initial(request.getMsgId()));
|
||||||
|
|
||||||
|
responseObserver.onNext(Empty.getDefaultInstance());
|
||||||
|
responseObserver.onCompleted();
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
logger.error("Failed to update nsfw filters", e);
|
||||||
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@@ -1,6 +1,7 @@
|
|||||||
package nu.marginalia.execution;
|
package nu.marginalia.execution;
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
|
import io.grpc.Status;
|
||||||
import io.grpc.stub.StreamObserver;
|
import io.grpc.stub.StreamObserver;
|
||||||
import nu.marginalia.actor.ExecutorActor;
|
import nu.marginalia.actor.ExecutorActor;
|
||||||
import nu.marginalia.actor.ExecutorActorControlService;
|
import nu.marginalia.actor.ExecutorActorControlService;
|
||||||
@@ -33,7 +34,7 @@ public class ExecutorSideloadGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -48,7 +49,7 @@ public class ExecutorSideloadGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -63,7 +64,7 @@ public class ExecutorSideloadGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -78,7 +79,7 @@ public class ExecutorSideloadGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -93,7 +94,7 @@ public class ExecutorSideloadGrpcService
|
|||||||
responseObserver.onCompleted();
|
responseObserver.onCompleted();
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -8,6 +8,8 @@ import nu.marginalia.crawl.CrawlerMain;
|
|||||||
import nu.marginalia.index.IndexConstructorMain;
|
import nu.marginalia.index.IndexConstructorMain;
|
||||||
import nu.marginalia.livecrawler.LiveCrawlerMain;
|
import nu.marginalia.livecrawler.LiveCrawlerMain;
|
||||||
import nu.marginalia.loading.LoaderMain;
|
import nu.marginalia.loading.LoaderMain;
|
||||||
|
import nu.marginalia.ndp.NdpMain;
|
||||||
|
import nu.marginalia.ping.PingMain;
|
||||||
import nu.marginalia.service.control.ServiceEventLog;
|
import nu.marginalia.service.control.ServiceEventLog;
|
||||||
import nu.marginalia.service.server.BaseServiceParams;
|
import nu.marginalia.service.server.BaseServiceParams;
|
||||||
import nu.marginalia.task.ExportTasksMain;
|
import nu.marginalia.task.ExportTasksMain;
|
||||||
@@ -27,7 +29,7 @@ import java.util.List;
|
|||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
public class ProcessService {
|
public class ProcessSpawnerService {
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
private final Marker processMarker = MarkerFactory.getMarker("PROCESS");
|
private final Marker processMarker = MarkerFactory.getMarker("PROCESS");
|
||||||
|
|
||||||
@@ -41,6 +43,7 @@ public class ProcessService {
|
|||||||
return switch (id) {
|
return switch (id) {
|
||||||
case "converter" -> ProcessId.CONVERTER;
|
case "converter" -> ProcessId.CONVERTER;
|
||||||
case "crawler" -> ProcessId.CRAWLER;
|
case "crawler" -> ProcessId.CRAWLER;
|
||||||
|
case "ping" -> ProcessId.PING;
|
||||||
case "loader" -> ProcessId.LOADER;
|
case "loader" -> ProcessId.LOADER;
|
||||||
case "export-tasks" -> ProcessId.EXPORT_TASKS;
|
case "export-tasks" -> ProcessId.EXPORT_TASKS;
|
||||||
case "index-constructor" -> ProcessId.INDEX_CONSTRUCTOR;
|
case "index-constructor" -> ProcessId.INDEX_CONSTRUCTOR;
|
||||||
@@ -50,10 +53,12 @@ public class ProcessService {
|
|||||||
|
|
||||||
public enum ProcessId {
|
public enum ProcessId {
|
||||||
CRAWLER(CrawlerMain.class),
|
CRAWLER(CrawlerMain.class),
|
||||||
|
PING(PingMain.class),
|
||||||
LIVE_CRAWLER(LiveCrawlerMain.class),
|
LIVE_CRAWLER(LiveCrawlerMain.class),
|
||||||
CONVERTER(ConverterMain.class),
|
CONVERTER(ConverterMain.class),
|
||||||
LOADER(LoaderMain.class),
|
LOADER(LoaderMain.class),
|
||||||
INDEX_CONSTRUCTOR(IndexConstructorMain.class),
|
INDEX_CONSTRUCTOR(IndexConstructorMain.class),
|
||||||
|
NDP(NdpMain.class),
|
||||||
EXPORT_TASKS(ExportTasksMain.class),
|
EXPORT_TASKS(ExportTasksMain.class),
|
||||||
;
|
;
|
||||||
|
|
||||||
@@ -68,6 +73,8 @@ public class ProcessService {
|
|||||||
case LIVE_CRAWLER -> "LIVE_CRAWLER_PROCESS_OPTS";
|
case LIVE_CRAWLER -> "LIVE_CRAWLER_PROCESS_OPTS";
|
||||||
case CONVERTER -> "CONVERTER_PROCESS_OPTS";
|
case CONVERTER -> "CONVERTER_PROCESS_OPTS";
|
||||||
case LOADER -> "LOADER_PROCESS_OPTS";
|
case LOADER -> "LOADER_PROCESS_OPTS";
|
||||||
|
case PING -> "PING_PROCESS_OPTS";
|
||||||
|
case NDP -> "NDP_PROCESS_OPTS";
|
||||||
case INDEX_CONSTRUCTOR -> "INDEX_CONSTRUCTION_PROCESS_OPTS";
|
case INDEX_CONSTRUCTOR -> "INDEX_CONSTRUCTION_PROCESS_OPTS";
|
||||||
case EXPORT_TASKS -> "EXPORT_TASKS_PROCESS_OPTS";
|
case EXPORT_TASKS -> "EXPORT_TASKS_PROCESS_OPTS";
|
||||||
};
|
};
|
||||||
@@ -81,7 +88,7 @@ public class ProcessService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public ProcessService(BaseServiceParams params) {
|
public ProcessSpawnerService(BaseServiceParams params) {
|
||||||
this.eventLog = params.eventLog;
|
this.eventLog = params.eventLog;
|
||||||
this.node = params.configuration.node();
|
this.node = params.configuration.node();
|
||||||
}
|
}
|
@@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.executor;
|
package nu.marginalia.svc;
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
@@ -1,5 +1,5 @@
|
|||||||
The execution subsystem is responsible for the execution of long running tasks on each
|
The execution subsystem is responsible for the execution of long running tasks on each
|
||||||
index node. It lives in the [executor-service](../services-core/executor-service) module.
|
index node. It lives in the [index-service](../services-core/index-service) module.
|
||||||
|
|
||||||
It accomplishes this using the [message queue and actor library](../libraries/message-queue/),
|
It accomplishes this using the [message queue and actor library](../libraries/message-queue/),
|
||||||
which permits program state to survive crashes and reboots.
|
which permits program state to survive crashes and reboots.
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.executor;
|
package nu.marginalia.svc;
|
||||||
|
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.storage.model.FileStorage;
|
import nu.marginalia.storage.model.FileStorage;
|
@@ -27,10 +27,12 @@ public class DbBrowseDomainsRandom {
|
|||||||
public List<BrowseResult> getRandomDomains(int count, DomainBlacklist blacklist, int set) {
|
public List<BrowseResult> getRandomDomains(int count, DomainBlacklist blacklist, int set) {
|
||||||
|
|
||||||
final String q = """
|
final String q = """
|
||||||
SELECT DOMAIN_ID, DOMAIN_NAME, INDEXED
|
SELECT EC_RANDOM_DOMAINS.DOMAIN_ID, DOMAIN_NAME, INDEXED
|
||||||
FROM EC_RANDOM_DOMAINS
|
FROM EC_RANDOM_DOMAINS
|
||||||
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID
|
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID
|
||||||
|
LEFT JOIN DOMAIN_AVAILABILITY_INFORMATION DAI ON DAI.DOMAIN_ID=EC_RANDOM_DOMAINS.DOMAIN_ID
|
||||||
WHERE STATE<2
|
WHERE STATE<2
|
||||||
|
AND SERVER_AVAILABLE
|
||||||
AND DOMAIN_SET=?
|
AND DOMAIN_SET=?
|
||||||
AND DOMAIN_ALIAS IS NULL
|
AND DOMAIN_ALIAS IS NULL
|
||||||
ORDER BY RAND()
|
ORDER BY RAND()
|
||||||
|
@@ -2,6 +2,8 @@ package nu.marginalia.api.domains;
|
|||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
|
import nu.marginalia.api.domains.model.DomainInformation;
|
||||||
|
import nu.marginalia.api.domains.model.SimilarDomain;
|
||||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||||
import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
|
import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
|
||||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||||
@@ -10,16 +12,19 @@ import org.slf4j.Logger;
|
|||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.concurrent.*;
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
import nu.marginalia.api.domains.model.*;
|
import java.util.concurrent.Future;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
public class DomainInfoClient {
|
public class DomainInfoClient {
|
||||||
private static final Logger logger = LoggerFactory.getLogger(DomainInfoClient.class);
|
private static final Logger logger = LoggerFactory.getLogger(DomainInfoClient.class);
|
||||||
|
|
||||||
private final GrpcSingleNodeChannelPool<DomainInfoAPIGrpc.DomainInfoAPIBlockingStub> channelPool;
|
private final GrpcSingleNodeChannelPool<DomainInfoAPIGrpc.DomainInfoAPIBlockingStub> channelPool;
|
||||||
private final ExecutorService executor = Executors.newWorkStealingPool(8);
|
|
||||||
|
|
||||||
|
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
|
||||||
|
private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newWorkStealingPool(8);
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public DomainInfoClient(GrpcChannelPoolFactory factory) {
|
public DomainInfoClient(GrpcChannelPoolFactory factory) {
|
||||||
|
@@ -0,0 +1,114 @@
|
|||||||
|
package nu.marginalia.api.domsample;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import io.grpc.Status;
|
||||||
|
import io.grpc.StatusRuntimeException;
|
||||||
|
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||||
|
import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
|
||||||
|
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||||
|
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.concurrent.CompletableFuture;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class DomSampleClient {
|
||||||
|
private final GrpcSingleNodeChannelPool<DomSampleApiGrpc.DomSampleApiBlockingStub> channelPool;
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(DomSampleClient.class);
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public DomSampleClient(GrpcChannelPoolFactory factory) {
|
||||||
|
|
||||||
|
// The client is only interested in the primary node
|
||||||
|
var key = ServiceKey.forGrpcApi(DomSampleApiGrpc.class, ServicePartition.any());
|
||||||
|
this.channelPool = factory.createSingle(key, DomSampleApiGrpc::newBlockingStub);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Optional<RpcDomainSample> getSample(String domainName) {
|
||||||
|
try {
|
||||||
|
var val = channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getSample)
|
||||||
|
.run(RpcDomainName.newBuilder().setDomainName(domainName).build());
|
||||||
|
|
||||||
|
return Optional.of(val);
|
||||||
|
}
|
||||||
|
catch (StatusRuntimeException sre) {
|
||||||
|
if (sre.getStatus() != Status.NOT_FOUND) {
|
||||||
|
logger.error("Failed to fetch DOM sample", sre);
|
||||||
|
}
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public Optional<RpcDomainSampleRequests> getSampleRequests(String domainName) {
|
||||||
|
try {
|
||||||
|
var val = channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getSampleRequests)
|
||||||
|
.run(RpcDomainName.newBuilder().setDomainName(domainName).build());
|
||||||
|
|
||||||
|
return Optional.of(val);
|
||||||
|
}
|
||||||
|
catch (StatusRuntimeException sre) {
|
||||||
|
if (sre.getStatus() != Status.NOT_FOUND) {
|
||||||
|
logger.error("Failed to fetch DOM sample", sre);
|
||||||
|
}
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasSample(String domainName) {
|
||||||
|
try {
|
||||||
|
return channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::hasSample)
|
||||||
|
.run(RpcDomainName.newBuilder().setDomainName(domainName).build())
|
||||||
|
.getAnswer();
|
||||||
|
}
|
||||||
|
catch (StatusRuntimeException sre) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public CompletableFuture<Boolean> hasSample(String domainName, ExecutorService executor) {
|
||||||
|
try {
|
||||||
|
return channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::hasSample)
|
||||||
|
.async(executor)
|
||||||
|
.run(RpcDomainName.newBuilder().setDomainName(domainName).build())
|
||||||
|
.thenApply(RpcBooleanRsp::getAnswer);
|
||||||
|
}
|
||||||
|
catch (StatusRuntimeException sre) {
|
||||||
|
return CompletableFuture.completedFuture(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public CompletableFuture<RpcDomainSample> getSampleAsync(String domainName, ExecutorService executorService) {
|
||||||
|
return channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getSample)
|
||||||
|
.async(executorService)
|
||||||
|
.run(RpcDomainName.newBuilder().setDomainName(domainName).build());
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<RpcDomainSample> getAllSamples(String domainName) {
|
||||||
|
try {
|
||||||
|
Iterator<RpcDomainSample> val = channelPool.call(DomSampleApiGrpc.DomSampleApiBlockingStub::getAllSamples)
|
||||||
|
.run(RpcDomainName.newBuilder().setDomainName(domainName).build());
|
||||||
|
|
||||||
|
List<RpcDomainSample> ret = new ArrayList<>();
|
||||||
|
val.forEachRemaining(ret::add);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
catch (StatusRuntimeException sre) {
|
||||||
|
logger.error("Failed to fetch DOM sample");
|
||||||
|
return List.of();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean waitReady(Duration duration) throws InterruptedException {
|
||||||
|
return channelPool.awaitChannel(duration);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
@@ -11,6 +11,7 @@ import nu.marginalia.service.discovery.property.ServicePartition;
|
|||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
|
||||||
import javax.annotation.CheckReturnValue;
|
import javax.annotation.CheckReturnValue;
|
||||||
|
import java.time.Duration;
|
||||||
import java.time.Instant;
|
import java.time.Instant;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@@ -23,7 +24,9 @@ import java.util.function.BiConsumer;
|
|||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
public class FeedsClient {
|
public class FeedsClient {
|
||||||
private final ExecutorService executorService = Executors.newCachedThreadPool();
|
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
|
||||||
|
private static final ExecutorService executorService = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newCachedThreadPool();
|
||||||
|
|
||||||
private final GrpcSingleNodeChannelPool<FeedApiGrpc.FeedApiBlockingStub> channelPool;
|
private final GrpcSingleNodeChannelPool<FeedApiGrpc.FeedApiBlockingStub> channelPool;
|
||||||
private final MqOutbox updateFeedsOutbox;
|
private final MqOutbox updateFeedsOutbox;
|
||||||
|
|
||||||
@@ -59,6 +62,11 @@ public class FeedsClient {
|
|||||||
.forEachRemaining(rsp -> consumer.accept(rsp.getDomain(), new ArrayList<>(rsp.getUrlList())));
|
.forEachRemaining(rsp -> consumer.accept(rsp.getDomain(), new ArrayList<>(rsp.getUrlList())));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean waitReady(Duration duration) throws InterruptedException {
|
||||||
|
return channelPool.awaitChannel(duration);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/** Get the hash of the feed data, for identifying when the data has been updated */
|
/** Get the hash of the feed data, for identifying when the data has been updated */
|
||||||
public String getFeedDataHash() {
|
public String getFeedDataHash() {
|
||||||
return channelPool.call(FeedApiGrpc.FeedApiBlockingStub::getFeedDataHash)
|
return channelPool.call(FeedApiGrpc.FeedApiBlockingStub::getFeedDataHash)
|
||||||
|
@@ -0,0 +1,47 @@
|
|||||||
|
syntax = "proto3";

package nu.marginalia.api.domsample;

option java_package = "nu.marginalia.api.domsample";
option java_multiple_files = true;

// Read-only API for DOM samples, keyed by domain name.
service DomSampleApi {
    // Single sample for the domain.
    rpc getSample(RpcDomainName) returns (RpcDomainSample);
    // Outgoing-request data only, without the HTML payload.
    rpc getSampleRequests(RpcDomainName) returns (RpcDomainSampleRequests);
    // Whether any sample exists for the domain.
    rpc hasSample(RpcDomainName) returns (RpcBooleanRsp);
    // Every sample for the domain, as a server-side stream.
    rpc getAllSamples(RpcDomainName) returns (stream RpcDomainSample);
}

// Request wrapper carrying the domain being queried.
message RpcDomainName {
    string domainName = 1;
}

// Generic yes/no reply.
message RpcBooleanRsp {
    bool answer = 1;
}

// Request-only view of a sample. Field numbers 1, 2 and 5 match the
// corresponding fields of RpcDomainSample; 3 and 4 are not used here.
message RpcDomainSampleRequests {
    string domainName = 1;
    string url = 2;
    repeated RpcOutgoingRequest outgoingRequests = 5;
}

// Full sample, including the HTML payload.
message RpcDomainSample {
    string domainName = 1;
    string url = 2;
    // Sample HTML, Zstd-compressed.
    bytes htmlSampleZstd = 3;
    bool accepted_popover = 4;
    repeated RpcOutgoingRequest outgoingRequests = 5;
}

// One request recorded for a sample.
message RpcOutgoingRequest {
    RequestMethod method = 1;
    // NOTE(review): unit/epoch of the timestamp is not stated in this file - confirm.
    int64 timestamp = 2;
    string url = 3;

    enum RequestMethod {
        GET = 0;
        POST = 1;
        OTHER = 2;
    }
}
|
@@ -22,6 +22,7 @@ dependencies {
|
|||||||
implementation project(':code:common:db')
|
implementation project(':code:common:db')
|
||||||
implementation project(':code:libraries:blocking-thread-pool')
|
implementation project(':code:libraries:blocking-thread-pool')
|
||||||
implementation project(':code:libraries:message-queue')
|
implementation project(':code:libraries:message-queue')
|
||||||
|
implementation project(':code:libraries:domain-lock')
|
||||||
|
|
||||||
implementation project(':code:execution:api')
|
implementation project(':code:execution:api')
|
||||||
implementation project(':code:processes:crawling-process:ft-content-type')
|
implementation project(':code:processes:crawling-process:ft-content-type')
|
||||||
@@ -30,10 +31,12 @@ dependencies {
|
|||||||
implementation libs.jsoup
|
implementation libs.jsoup
|
||||||
implementation libs.opencsv
|
implementation libs.opencsv
|
||||||
implementation libs.slop
|
implementation libs.slop
|
||||||
|
implementation libs.zstd
|
||||||
implementation libs.sqlite
|
implementation libs.sqlite
|
||||||
implementation libs.bundles.slf4j
|
implementation libs.bundles.slf4j
|
||||||
implementation libs.commons.lang3
|
implementation libs.commons.lang3
|
||||||
implementation libs.commons.io
|
implementation libs.commons.io
|
||||||
|
implementation libs.httpclient
|
||||||
implementation libs.wiremock
|
implementation libs.wiremock
|
||||||
|
|
||||||
implementation libs.prometheus
|
implementation libs.prometheus
|
||||||
|
@@ -0,0 +1,176 @@
|
|||||||
|
package nu.marginalia.domsample;
|
||||||
|
|
||||||
|
import com.github.luben.zstd.Zstd;
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.protobuf.ByteString;
|
||||||
|
import io.grpc.Status;
|
||||||
|
import io.grpc.stub.StreamObserver;
|
||||||
|
import nu.marginalia.api.domsample.*;
|
||||||
|
import nu.marginalia.domsample.db.DomSampleDb;
|
||||||
|
import nu.marginalia.service.server.DiscoverableService;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public class DomSampleGrpcService
|
||||||
|
extends DomSampleApiGrpc.DomSampleApiImplBase
|
||||||
|
implements DiscoverableService
|
||||||
|
{
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(DomSampleGrpcService.class);
|
||||||
|
|
||||||
|
private final DomSampleDb domSampleDb;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public DomSampleGrpcService(DomSampleDb domSampleDb) {
|
||||||
|
this.domSampleDb = domSampleDb;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void getSample(RpcDomainName request, StreamObserver<RpcDomainSample> responseObserver) {
|
||||||
|
String domainName = request.getDomainName();
|
||||||
|
if (domainName.isBlank()) {
|
||||||
|
responseObserver.onError(Status.INVALID_ARGUMENT
|
||||||
|
.withDescription("Invalid domain name")
|
||||||
|
.asRuntimeException());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
|
||||||
|
if (dbRecords.isEmpty()) {
|
||||||
|
responseObserver.onError(Status.NOT_FOUND.withDescription("No sample found").asRuntimeException());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Grab the first sample
|
||||||
|
RpcDomainSample.Builder response = convertFullSample(dbRecords.getFirst());
|
||||||
|
|
||||||
|
responseObserver.onNext(response.build());
|
||||||
|
responseObserver.onCompleted();
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
logger.error("Error in getSample()", e);
|
||||||
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void getSampleRequests(RpcDomainName request, StreamObserver<RpcDomainSampleRequests> responseObserver) {
|
||||||
|
String domainName = request.getDomainName();
|
||||||
|
if (domainName.isBlank()) {
|
||||||
|
responseObserver.onError(Status.INVALID_ARGUMENT
|
||||||
|
.withDescription("Invalid domain name")
|
||||||
|
.asRuntimeException());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
|
||||||
|
if (dbRecords.isEmpty()) {
|
||||||
|
responseObserver.onError(Status.NOT_FOUND.withDescription("No sample found").asRuntimeException());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Grab the first sample
|
||||||
|
RpcDomainSampleRequests.Builder response = convertRequestData(dbRecords.getFirst());
|
||||||
|
|
||||||
|
responseObserver.onNext(response.build());
|
||||||
|
responseObserver.onCompleted();
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
logger.error("Error in getSample()", e);
|
||||||
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void hasSample(RpcDomainName request, StreamObserver<RpcBooleanRsp> responseObserver) {
|
||||||
|
String domainName = request.getDomainName();
|
||||||
|
if (domainName.isBlank()) {
|
||||||
|
responseObserver.onError(Status.INVALID_ARGUMENT
|
||||||
|
.withDescription("Invalid domain name")
|
||||||
|
.asRuntimeException());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
responseObserver.onNext(RpcBooleanRsp.newBuilder()
|
||||||
|
.setAnswer(domSampleDb.hasSample(domainName)).build());
|
||||||
|
responseObserver.onCompleted();
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void getAllSamples(RpcDomainName request, StreamObserver<RpcDomainSample> responseObserver) {
|
||||||
|
String domainName = request.getDomainName();
|
||||||
|
if (domainName.isBlank()) {
|
||||||
|
responseObserver.onError(Status.INVALID_ARGUMENT
|
||||||
|
.withDescription("Invalid domain name")
|
||||||
|
.asRuntimeException());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
List<DomSampleDb.Sample> dbRecords = domSampleDb.getSamples(domainName);
|
||||||
|
|
||||||
|
for (var record : dbRecords) {
|
||||||
|
responseObserver.onNext(convertFullSample(record).build());
|
||||||
|
}
|
||||||
|
|
||||||
|
responseObserver.onCompleted();
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
logger.error("Error in getSample()", e);
|
||||||
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private RpcDomainSample.Builder convertFullSample(DomSampleDb.Sample dbSample) {
|
||||||
|
|
||||||
|
ByteString htmlZstd = ByteString.copyFrom(Zstd.compress(dbSample.sample().getBytes(StandardCharsets.UTF_8)));
|
||||||
|
|
||||||
|
var sampleBuilder = RpcDomainSample.newBuilder()
|
||||||
|
.setDomainName(dbSample.domain())
|
||||||
|
.setAcceptedPopover(dbSample.acceptedPopover())
|
||||||
|
.setHtmlSampleZstd(htmlZstd);
|
||||||
|
|
||||||
|
for (var req : dbSample.parseRequests()) {
|
||||||
|
sampleBuilder.addOutgoingRequestsBuilder()
|
||||||
|
.setUrl(req.uri().toString())
|
||||||
|
.setMethod(switch (req.method().toUpperCase())
|
||||||
|
{
|
||||||
|
case "GET" -> RpcOutgoingRequest.RequestMethod.GET;
|
||||||
|
case "POST" -> RpcOutgoingRequest.RequestMethod.POST;
|
||||||
|
default -> RpcOutgoingRequest.RequestMethod.OTHER;
|
||||||
|
})
|
||||||
|
.setTimestamp(req.timestamp());
|
||||||
|
}
|
||||||
|
|
||||||
|
return sampleBuilder;
|
||||||
|
}
|
||||||
|
|
||||||
|
private RpcDomainSampleRequests.Builder convertRequestData(DomSampleDb.Sample dbSample) {
|
||||||
|
|
||||||
|
var sampleBuilder = RpcDomainSampleRequests.newBuilder()
|
||||||
|
.setDomainName(dbSample.domain());
|
||||||
|
|
||||||
|
for (var req : dbSample.parseRequests()) {
|
||||||
|
sampleBuilder.addOutgoingRequestsBuilder()
|
||||||
|
.setUrl(req.uri().toString())
|
||||||
|
.setMethod(switch (req.method().toUpperCase())
|
||||||
|
{
|
||||||
|
case "GET" -> RpcOutgoingRequest.RequestMethod.GET;
|
||||||
|
case "POST" -> RpcOutgoingRequest.RequestMethod.POST;
|
||||||
|
default -> RpcOutgoingRequest.RequestMethod.OTHER;
|
||||||
|
})
|
||||||
|
.setTimestamp(req.timestamp());
|
||||||
|
}
|
||||||
|
|
||||||
|
return sampleBuilder;
|
||||||
|
}
|
||||||
|
}
|
@@ -1,17 +1,28 @@
|
|||||||
package nu.marginalia.domsample.db;
|
package nu.marginalia.domsample.db;
|
||||||
|
|
||||||
import nu.marginalia.WmsaHome;
|
import nu.marginalia.WmsaHome;
|
||||||
|
import nu.marginalia.model.EdgeUrl;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.net.URI;
|
||||||
|
import java.net.URISyntaxException;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.sql.Connection;
|
import java.sql.Connection;
|
||||||
import java.sql.DriverManager;
|
import java.sql.DriverManager;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
import java.util.*;
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.function.Predicate;
|
||||||
|
|
||||||
public class DomSampleDb implements AutoCloseable {
|
public class DomSampleDb implements AutoCloseable {
|
||||||
private static final String dbFileName = "dom-sample.db";
|
private static final String dbFileName = "dom-sample.db";
|
||||||
private final Connection connection;
|
private final Connection connection;
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(DomSampleDb.class);
|
||||||
|
|
||||||
public DomSampleDb() throws SQLException{
|
public DomSampleDb() throws SQLException{
|
||||||
this(WmsaHome.getDataPath().resolve(dbFileName));
|
this(WmsaHome.getDataPath().resolve(dbFileName));
|
||||||
@@ -88,7 +99,71 @@ public class DomSampleDb implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public record Sample(String url, String domain, String sample, String requests, boolean acceptedPopover) {}
|
public record Sample(String url, String domain, String sample, String requests, boolean acceptedPopover) {
|
||||||
|
|
||||||
|
public List<SampleRequest> parseRequests() {
|
||||||
|
List<SampleRequest> requests = new ArrayList<>();
|
||||||
|
|
||||||
|
// Request format is METHOD\tTIMESTAMP\tURI\n
|
||||||
|
|
||||||
|
for (var line : StringUtils.split(this.requests, '\n')) {
|
||||||
|
String[] parts = StringUtils.split(line, "\t", 3);
|
||||||
|
if (parts.length != 3) continue;
|
||||||
|
|
||||||
|
try {
|
||||||
|
String method = parts[0];
|
||||||
|
long ts = Long.parseLong(parts[1]);
|
||||||
|
String linkUrl = parts[2];
|
||||||
|
|
||||||
|
URI uri = parseURI(linkUrl);
|
||||||
|
|
||||||
|
requests.add(new SampleRequest(method, ts, uri));
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
logger.warn("Failed to parse requests", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return requests;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static URI parseURI(String uri) throws URISyntaxException {
|
||||||
|
try {
|
||||||
|
return new URI(uri);
|
||||||
|
}
|
||||||
|
catch (URISyntaxException ex) {
|
||||||
|
return new EdgeUrl(uri).asURI();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** One entry of a sample's request log: the method, the timestamp and the request URI, as parsed from a {@code METHOD\tTIMESTAMP\tURI} line. */
public record SampleRequest(String method, long timestamp, URI uri) {}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param consumer - consume the sample, return true to continue consumption
|
||||||
|
* @throws SQLException
|
||||||
|
*/
|
||||||
|
public void forEachSample(Predicate<Sample> consumer) throws SQLException {
|
||||||
|
try (var stmt = connection.prepareStatement("""
|
||||||
|
SELECT url, domain, sample, requests, accepted_popover
|
||||||
|
FROM samples
|
||||||
|
"""))
|
||||||
|
{
|
||||||
|
var rs = stmt.executeQuery();
|
||||||
|
while (rs.next()) {
|
||||||
|
var sample = new Sample(
|
||||||
|
rs.getString("url"),
|
||||||
|
rs.getString("domain"),
|
||||||
|
rs.getString("sample"),
|
||||||
|
rs.getString("requests"),
|
||||||
|
rs.getBoolean("accepted_popover")
|
||||||
|
);
|
||||||
|
|
||||||
|
if (!consumer.test(sample)) break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public List<Sample> getSamples(String domain) throws SQLException {
|
public List<Sample> getSamples(String domain) throws SQLException {
|
||||||
List<Sample> samples = new ArrayList<>();
|
List<Sample> samples = new ArrayList<>();
|
||||||
@@ -116,6 +191,21 @@ public class DomSampleDb implements AutoCloseable {
|
|||||||
return samples;
|
return samples;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean hasSample(String domain) throws SQLException {
|
||||||
|
|
||||||
|
try (var stmt = connection.prepareStatement("""
|
||||||
|
SELECT 1
|
||||||
|
FROM samples
|
||||||
|
WHERE domain = ?
|
||||||
|
"""))
|
||||||
|
{
|
||||||
|
stmt.setString(1, domain);
|
||||||
|
var rs = stmt.executeQuery();
|
||||||
|
return rs.next();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public void saveSample(String domain, String url, String rawContent) throws SQLException {
|
public void saveSample(String domain, String url, String rawContent) throws SQLException {
|
||||||
var doc = Jsoup.parse(rawContent);
|
var doc = Jsoup.parse(rawContent);
|
||||||
|
|
||||||
|
@@ -1,66 +0,0 @@
|
|||||||
package nu.marginalia.rss.svc;
|
|
||||||
|
|
||||||
import nu.marginalia.model.EdgeDomain;
|
|
||||||
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
|
||||||
import java.util.concurrent.Semaphore;
|
|
||||||
|
|
||||||
/** Holds lock objects for each domain, to prevent multiple threads from
|
|
||||||
* crawling the same domain at the same time.
|
|
||||||
*/
|
|
||||||
public class DomainLocks {
|
|
||||||
// The locks are stored in a map, with the domain name as the key. This map will grow
|
|
||||||
// relatively big, but should be manageable since the number of domains is limited to
|
|
||||||
// a few hundred thousand typically.
|
|
||||||
private final Map<String, Semaphore> locks = new ConcurrentHashMap<>();
|
|
||||||
|
|
||||||
/** Returns a lock object corresponding to the given domain. The object is returned as-is,
|
|
||||||
* and may be held by another thread. The caller is responsible for locking and releasing the lock.
|
|
||||||
*/
|
|
||||||
public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
|
|
||||||
return new DomainLock(domain.toString(),
|
|
||||||
locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits));
|
|
||||||
}
|
|
||||||
|
|
||||||
private Semaphore defaultPermits(String topDomain) {
|
|
||||||
if (topDomain.equals("wordpress.com"))
|
|
||||||
return new Semaphore(16);
|
|
||||||
if (topDomain.equals("blogspot.com"))
|
|
||||||
return new Semaphore(8);
|
|
||||||
|
|
||||||
if (topDomain.equals("neocities.org"))
|
|
||||||
return new Semaphore(4);
|
|
||||||
if (topDomain.equals("github.io"))
|
|
||||||
return new Semaphore(4);
|
|
||||||
|
|
||||||
if (topDomain.equals("substack.com")) {
|
|
||||||
return new Semaphore(1);
|
|
||||||
}
|
|
||||||
if (topDomain.endsWith(".edu")) {
|
|
||||||
return new Semaphore(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
return new Semaphore(2);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static class DomainLock implements AutoCloseable {
|
|
||||||
private final String domainName;
|
|
||||||
private final Semaphore semaphore;
|
|
||||||
|
|
||||||
DomainLock(String domainName, Semaphore semaphore) throws InterruptedException {
|
|
||||||
this.domainName = domainName;
|
|
||||||
this.semaphore = semaphore;
|
|
||||||
|
|
||||||
Thread.currentThread().setName("fetching:" + domainName + " [await domain lock]");
|
|
||||||
semaphore.acquire();
|
|
||||||
Thread.currentThread().setName("fetching:" + domainName);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void close() {
|
|
||||||
semaphore.release();
|
|
||||||
Thread.currentThread().setName("fetching:" + domainName + " [wrapping up]");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@@ -5,6 +5,8 @@ import com.opencsv.CSVReader;
|
|||||||
import nu.marginalia.WmsaHome;
|
import nu.marginalia.WmsaHome;
|
||||||
import nu.marginalia.contenttype.ContentType;
|
import nu.marginalia.contenttype.ContentType;
|
||||||
import nu.marginalia.contenttype.DocumentBodyToString;
|
import nu.marginalia.contenttype.DocumentBodyToString;
|
||||||
|
import nu.marginalia.coordination.DomainCoordinator;
|
||||||
|
import nu.marginalia.coordination.DomainLock;
|
||||||
import nu.marginalia.executor.client.ExecutorClient;
|
import nu.marginalia.executor.client.ExecutorClient;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.nodecfg.NodeConfigurationService;
|
import nu.marginalia.nodecfg.NodeConfigurationService;
|
||||||
@@ -18,19 +20,36 @@ import nu.marginalia.storage.FileStorageService;
|
|||||||
import nu.marginalia.storage.model.FileStorage;
|
import nu.marginalia.storage.model.FileStorage;
|
||||||
import nu.marginalia.storage.model.FileStorageType;
|
import nu.marginalia.storage.model.FileStorageType;
|
||||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||||
|
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
|
||||||
|
import org.apache.hc.client5.http.classic.HttpClient;
|
||||||
|
import org.apache.hc.client5.http.config.ConnectionConfig;
|
||||||
|
import org.apache.hc.client5.http.config.RequestConfig;
|
||||||
|
import org.apache.hc.client5.http.cookie.StandardCookieSpec;
|
||||||
|
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||||
|
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder;
|
||||||
|
import org.apache.hc.core5.http.Header;
|
||||||
|
import org.apache.hc.core5.http.HeaderElement;
|
||||||
|
import org.apache.hc.core5.http.HeaderElements;
|
||||||
|
import org.apache.hc.core5.http.HttpResponse;
|
||||||
|
import org.apache.hc.core5.http.io.SocketConfig;
|
||||||
|
import org.apache.hc.core5.http.io.entity.EntityUtils;
|
||||||
|
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
||||||
|
import org.apache.hc.core5.http.message.MessageSupport;
|
||||||
|
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||||
|
import org.apache.hc.core5.util.TimeValue;
|
||||||
|
import org.apache.hc.core5.util.Timeout;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
import java.io.ByteArrayInputStream;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
import java.net.http.HttpClient;
|
|
||||||
import java.net.http.HttpRequest;
|
|
||||||
import java.net.http.HttpResponse;
|
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
import java.time.*;
|
import java.time.Instant;
|
||||||
|
import java.time.LocalDateTime;
|
||||||
|
import java.time.ZoneId;
|
||||||
|
import java.time.ZonedDateTime;
|
||||||
import java.time.format.DateTimeFormatter;
|
import java.time.format.DateTimeFormatter;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.concurrent.ExecutorService;
|
import java.util.concurrent.ExecutorService;
|
||||||
@@ -51,12 +70,15 @@ public class FeedFetcherService {
|
|||||||
private final ServiceHeartbeat serviceHeartbeat;
|
private final ServiceHeartbeat serviceHeartbeat;
|
||||||
private final ExecutorClient executorClient;
|
private final ExecutorClient executorClient;
|
||||||
|
|
||||||
private final DomainLocks domainLocks = new DomainLocks();
|
private final DomainCoordinator domainCoordinator;
|
||||||
|
|
||||||
|
private final HttpClient httpClient;
|
||||||
|
|
||||||
private volatile boolean updating;
|
private volatile boolean updating;
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public FeedFetcherService(FeedDb feedDb,
|
public FeedFetcherService(FeedDb feedDb,
|
||||||
|
DomainCoordinator domainCoordinator,
|
||||||
FileStorageService fileStorageService,
|
FileStorageService fileStorageService,
|
||||||
NodeConfigurationService nodeConfigurationService,
|
NodeConfigurationService nodeConfigurationService,
|
||||||
ServiceHeartbeat serviceHeartbeat,
|
ServiceHeartbeat serviceHeartbeat,
|
||||||
@@ -67,6 +89,84 @@ public class FeedFetcherService {
|
|||||||
this.nodeConfigurationService = nodeConfigurationService;
|
this.nodeConfigurationService = nodeConfigurationService;
|
||||||
this.serviceHeartbeat = serviceHeartbeat;
|
this.serviceHeartbeat = serviceHeartbeat;
|
||||||
this.executorClient = executorClient;
|
this.executorClient = executorClient;
|
||||||
|
this.domainCoordinator = domainCoordinator;
|
||||||
|
|
||||||
|
final ConnectionConfig connectionConfig = ConnectionConfig.custom()
|
||||||
|
.setSocketTimeout(15, TimeUnit.SECONDS)
|
||||||
|
.setConnectTimeout(15, TimeUnit.SECONDS)
|
||||||
|
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
|
||||||
|
.build();
|
||||||
|
|
||||||
|
|
||||||
|
var connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
|
||||||
|
.setMaxConnPerRoute(2)
|
||||||
|
.setMaxConnTotal(50)
|
||||||
|
.setDefaultConnectionConfig(connectionConfig)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
|
||||||
|
.setSoLinger(TimeValue.ofSeconds(-1))
|
||||||
|
.setSoTimeout(Timeout.ofSeconds(10))
|
||||||
|
.build()
|
||||||
|
);
|
||||||
|
|
||||||
|
Thread.ofPlatform().daemon(true).start(() -> {
|
||||||
|
try {
|
||||||
|
for (;;) {
|
||||||
|
TimeUnit.SECONDS.sleep(15);
|
||||||
|
logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (InterruptedException e) {
|
||||||
|
Thread.currentThread().interrupt();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
final RequestConfig defaultRequestConfig = RequestConfig.custom()
|
||||||
|
.setCookieSpec(StandardCookieSpec.IGNORE)
|
||||||
|
.setResponseTimeout(10, TimeUnit.SECONDS)
|
||||||
|
.setConnectionRequestTimeout(5, TimeUnit.MINUTES)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
httpClient = HttpClients.custom()
|
||||||
|
.setDefaultRequestConfig(defaultRequestConfig)
|
||||||
|
.setConnectionManager(connectionManager)
|
||||||
|
.setUserAgent(WmsaHome.getUserAgent().uaIdentifier())
|
||||||
|
.setConnectionManager(connectionManager)
|
||||||
|
.setKeepAliveStrategy(new ConnectionKeepAliveStrategy() {
|
||||||
|
// Default keep-alive duration is 3 minutes, but this is too long for us,
|
||||||
|
// as we are either going to re-use it fairly quickly or close it for a long time.
|
||||||
|
//
|
||||||
|
// So we set it to 30 seconds or clamp the server-provided value to a minimum of 10 seconds.
|
||||||
|
private static final TimeValue defaultValue = TimeValue.ofSeconds(30);
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TimeValue getKeepAliveDuration(HttpResponse response, HttpContext context) {
|
||||||
|
final Iterator<HeaderElement> it = MessageSupport.iterate(response, HeaderElements.KEEP_ALIVE);
|
||||||
|
|
||||||
|
while (it.hasNext()) {
|
||||||
|
final HeaderElement he = it.next();
|
||||||
|
final String param = he.getName();
|
||||||
|
final String value = he.getValue();
|
||||||
|
|
||||||
|
if (value == null)
|
||||||
|
continue;
|
||||||
|
if (!"timeout".equalsIgnoreCase(param))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
try {
|
||||||
|
long timeout = Long.parseLong(value);
|
||||||
|
timeout = Math.clamp(timeout, 30, defaultValue.toSeconds());
|
||||||
|
return TimeValue.ofSeconds(timeout);
|
||||||
|
} catch (final NumberFormatException ignore) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return defaultValue;
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.build();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public enum UpdateMode {
|
public enum UpdateMode {
|
||||||
@@ -82,13 +182,7 @@ public class FeedFetcherService {
|
|||||||
|
|
||||||
|
|
||||||
try (FeedDbWriter writer = feedDb.createWriter();
|
try (FeedDbWriter writer = feedDb.createWriter();
|
||||||
HttpClient client = HttpClient.newBuilder()
|
ExecutorService fetchExecutor = Executors.newVirtualThreadPerTaskExecutor();
|
||||||
.connectTimeout(Duration.ofSeconds(15))
|
|
||||||
.executor(Executors.newCachedThreadPool())
|
|
||||||
.followRedirects(HttpClient.Redirect.NORMAL)
|
|
||||||
.version(HttpClient.Version.HTTP_2)
|
|
||||||
.build();
|
|
||||||
ExecutorService fetchExecutor = Executors.newCachedThreadPool();
|
|
||||||
FeedJournal feedJournal = FeedJournal.create();
|
FeedJournal feedJournal = FeedJournal.create();
|
||||||
var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Update Rss Feeds")
|
var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Update Rss Feeds")
|
||||||
) {
|
) {
|
||||||
@@ -132,8 +226,9 @@ public class FeedFetcherService {
|
|||||||
};
|
};
|
||||||
|
|
||||||
FetchResult feedData;
|
FetchResult feedData;
|
||||||
try (DomainLocks.DomainLock domainLock = domainLocks.lockDomain(new EdgeDomain(feed.domain()))) {
|
try (DomainLock domainLock = domainCoordinator.lockDomain(new EdgeDomain(feed.domain()))) {
|
||||||
feedData = fetchFeedData(feed, client, fetchExecutor, ifModifiedSinceDate, ifNoneMatchTag);
|
feedData = fetchFeedData(feed, fetchExecutor, ifModifiedSinceDate, ifNoneMatchTag);
|
||||||
|
TimeUnit.SECONDS.sleep(1); // Sleep before we yield the lock to avoid hammering the server from multiple processes
|
||||||
} catch (Exception ex) {
|
} catch (Exception ex) {
|
||||||
feedData = new FetchResult.TransientError();
|
feedData = new FetchResult.TransientError();
|
||||||
}
|
}
|
||||||
@@ -212,7 +307,6 @@ public class FeedFetcherService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private FetchResult fetchFeedData(FeedDefinition feed,
|
private FetchResult fetchFeedData(FeedDefinition feed,
|
||||||
HttpClient client,
|
|
||||||
ExecutorService executorService,
|
ExecutorService executorService,
|
||||||
@Nullable String ifModifiedSinceDate,
|
@Nullable String ifModifiedSinceDate,
|
||||||
@Nullable String ifNoneMatchTag)
|
@Nullable String ifNoneMatchTag)
|
||||||
@@ -220,59 +314,63 @@ public class FeedFetcherService {
|
|||||||
try {
|
try {
|
||||||
URI uri = new URI(feed.feedUrl());
|
URI uri = new URI(feed.feedUrl());
|
||||||
|
|
||||||
HttpRequest.Builder requestBuilder = HttpRequest.newBuilder()
|
var requestBuilder = ClassicRequestBuilder.get(uri)
|
||||||
.GET()
|
.setHeader("User-Agent", WmsaHome.getUserAgent().uaIdentifier())
|
||||||
.uri(uri)
|
.setHeader("Accept-Encoding", "gzip")
|
||||||
.header("User-Agent", WmsaHome.getUserAgent().uaIdentifier())
|
.setHeader("Accept", "text/*, */*;q=0.9");
|
||||||
.header("Accept-Encoding", "gzip")
|
|
||||||
.header("Accept", "text/*, */*;q=0.9")
|
|
||||||
.timeout(Duration.ofSeconds(15))
|
|
||||||
;
|
|
||||||
|
|
||||||
// Set the If-Modified-Since or If-None-Match headers if we have them
|
// Set the If-Modified-Since or If-None-Match headers if we have them
|
||||||
// though since there are certain idiosyncrasies in server implementations,
|
// though since there are certain idiosyncrasies in server implementations,
|
||||||
// we avoid setting both at the same time as that may turn a 304 into a 200.
|
// we avoid setting both at the same time as that may turn a 304 into a 200.
|
||||||
if (ifNoneMatchTag != null) {
|
if (ifNoneMatchTag != null) {
|
||||||
requestBuilder.header("If-None-Match", ifNoneMatchTag);
|
requestBuilder.addHeader("If-None-Match", ifNoneMatchTag);
|
||||||
} else if (ifModifiedSinceDate != null) {
|
} else if (ifModifiedSinceDate != null) {
|
||||||
requestBuilder.header("If-Modified-Since", ifModifiedSinceDate);
|
requestBuilder.addHeader("If-Modified-Since", ifModifiedSinceDate);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return httpClient.execute(requestBuilder.build(), rsp -> {
|
||||||
|
try {
|
||||||
|
logger.info("Code: {}, URL: {}", rsp.getCode(), uri);
|
||||||
|
|
||||||
HttpRequest getRequest = requestBuilder.build();
|
switch (rsp.getCode()) {
|
||||||
|
|
||||||
for (int i = 0; i < 3; i++) {
|
|
||||||
|
|
||||||
/* Note we need to use an executor to time-limit the send() method in HttpClient, as
|
|
||||||
* its support for timeouts only applies to the time until response starts to be received,
|
|
||||||
* and does not catch the case when the server starts to send data but then hangs.
|
|
||||||
*/
|
|
||||||
HttpResponse<byte[]> rs = executorService.submit(
|
|
||||||
() -> client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray()))
|
|
||||||
.get(15, TimeUnit.SECONDS);
|
|
||||||
|
|
||||||
if (rs.statusCode() == 429) { // Too Many Requests
|
|
||||||
int retryAfter = Integer.parseInt(rs.headers().firstValue("Retry-After").orElse("2"));
|
|
||||||
Thread.sleep(Duration.ofSeconds(Math.clamp(retryAfter, 1, 5)));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
String newEtagValue = rs.headers().firstValue("ETag").orElse("");
|
|
||||||
|
|
||||||
return switch (rs.statusCode()) {
|
|
||||||
case 200 -> {
|
case 200 -> {
|
||||||
byte[] responseData = getResponseData(rs);
|
if (rsp.getEntity() == null) {
|
||||||
|
return new FetchResult.TransientError(); // No content to read, treat as transient error
|
||||||
|
}
|
||||||
|
byte[] responseData = EntityUtils.toByteArray(rsp.getEntity());
|
||||||
|
|
||||||
String contentType = rs.headers().firstValue("Content-Type").orElse("");
|
// Decode the response body based on the Content-Type header
|
||||||
|
Header contentTypeHeader = rsp.getFirstHeader("Content-Type");
|
||||||
|
if (contentTypeHeader == null) {
|
||||||
|
return new FetchResult.TransientError();
|
||||||
|
}
|
||||||
|
String contentType = contentTypeHeader.getValue();
|
||||||
String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), responseData);
|
String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), responseData);
|
||||||
|
|
||||||
yield new FetchResult.Success(bodyText, newEtagValue);
|
// Grab the ETag header if it exists
|
||||||
|
Header etagHeader = rsp.getFirstHeader("ETag");
|
||||||
|
String newEtagValue = etagHeader == null ? null : etagHeader.getValue();
|
||||||
|
|
||||||
|
return new FetchResult.Success(bodyText, newEtagValue);
|
||||||
}
|
}
|
||||||
case 304 -> new FetchResult.NotModified(); // via If-Modified-Since semantics
|
case 304 -> {
|
||||||
case 404 -> new FetchResult.PermanentError(); // never try again
|
return new FetchResult.NotModified(); // via If-Modified-Since semantics
|
||||||
default -> new FetchResult.TransientError(); // we try again later
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
case 404 -> {
|
||||||
|
return new FetchResult.PermanentError(); // never try again
|
||||||
|
}
|
||||||
|
default -> {
|
||||||
|
return new FetchResult.TransientError(); // we try again later
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (Exception ex) {
|
||||||
|
return new FetchResult.PermanentError(); // treat as permanent error
|
||||||
|
}
|
||||||
|
finally {
|
||||||
|
EntityUtils.consumeQuietly(rsp.getEntity());
|
||||||
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
logger.debug("Error fetching feed", ex);
|
logger.debug("Error fetching feed", ex);
|
||||||
@@ -281,19 +379,6 @@ public class FeedFetcherService {
|
|||||||
return new FetchResult.TransientError();
|
return new FetchResult.TransientError();
|
||||||
}
|
}
|
||||||
|
|
||||||
private byte[] getResponseData(HttpResponse<byte[]> response) throws IOException {
|
|
||||||
String encoding = response.headers().firstValue("Content-Encoding").orElse("");
|
|
||||||
|
|
||||||
if ("gzip".equals(encoding)) {
|
|
||||||
try (var stream = new GZIPInputStream(new ByteArrayInputStream(response.body()))) {
|
|
||||||
return stream.readAllBytes();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
return response.body();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public sealed interface FetchResult {
|
public sealed interface FetchResult {
|
||||||
record Success(String value, String etag) implements FetchResult {}
|
record Success(String value, String etag) implements FetchResult {}
|
||||||
record NotModified() implements FetchResult {}
|
record NotModified() implements FetchResult {}
|
||||||
|
@@ -1,6 +1,7 @@
|
|||||||
package nu.marginalia.rss.svc;
|
package nu.marginalia.rss.svc;
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
|
import io.grpc.Status;
|
||||||
import io.grpc.stub.StreamObserver;
|
import io.grpc.stub.StreamObserver;
|
||||||
import nu.marginalia.api.feeds.*;
|
import nu.marginalia.api.feeds.*;
|
||||||
import nu.marginalia.db.DbDomainQueries;
|
import nu.marginalia.db.DbDomainQueries;
|
||||||
@@ -69,7 +70,7 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis
|
|||||||
@Override
|
@Override
|
||||||
public void getFeedDataHash(Empty request, StreamObserver<RpcFeedDataHash> responseObserver) {
|
public void getFeedDataHash(Empty request, StreamObserver<RpcFeedDataHash> responseObserver) {
|
||||||
if (!feedDb.isEnabled()) {
|
if (!feedDb.isEnabled()) {
|
||||||
responseObserver.onError(new IllegalStateException("Feed database is disabled on this node"));
|
responseObserver.onError(Status.INTERNAL.withDescription("Feed database is disabled on this node").asRuntimeException());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -80,7 +81,7 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis
|
|||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
logger.error("Error getting feed data hash", e);
|
logger.error("Error getting feed data hash", e);
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -101,7 +102,7 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis
|
|||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
logger.error("Error getting updated links", e);
|
logger.error("Error getting updated links", e);
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -109,13 +110,13 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis
|
|||||||
public void getFeed(RpcDomainId request,
|
public void getFeed(RpcDomainId request,
|
||||||
StreamObserver<RpcFeed> responseObserver) {
|
StreamObserver<RpcFeed> responseObserver) {
|
||||||
if (!feedDb.isEnabled()) {
|
if (!feedDb.isEnabled()) {
|
||||||
responseObserver.onError(new IllegalStateException("Feed database is disabled on this node"));
|
responseObserver.onError(Status.INTERNAL.withDescription("Feed database is disabled on this node").asRuntimeException());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
Optional<EdgeDomain> domainName = domainQueries.getDomain(request.getDomainId());
|
Optional<EdgeDomain> domainName = domainQueries.getDomain(request.getDomainId());
|
||||||
if (domainName.isEmpty()) {
|
if (domainName.isEmpty()) {
|
||||||
responseObserver.onError(new IllegalArgumentException("Domain not found"));
|
responseObserver.onError(Status.NOT_FOUND.withDescription("Domain not found").asRuntimeException());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -5,6 +5,8 @@ import com.google.inject.Guice;
|
|||||||
import com.google.inject.name.Names;
|
import com.google.inject.name.Names;
|
||||||
import com.zaxxer.hikari.HikariConfig;
|
import com.zaxxer.hikari.HikariConfig;
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import nu.marginalia.coordination.DomainCoordinator;
|
||||||
|
import nu.marginalia.coordination.LocalDomainCoordinator;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.rss.db.FeedDb;
|
import nu.marginalia.rss.db.FeedDb;
|
||||||
import nu.marginalia.rss.model.FeedItems;
|
import nu.marginalia.rss.model.FeedItems;
|
||||||
@@ -82,9 +84,10 @@ class FeedFetcherServiceTest extends AbstractModule {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public void configure() {
|
public void configure() {
|
||||||
|
bind(DomainCoordinator.class).to(LocalDomainCoordinator.class);
|
||||||
bind(HikariDataSource.class).toInstance(dataSource);
|
bind(HikariDataSource.class).toInstance(dataSource);
|
||||||
bind(ServiceRegistryIf.class).toInstance(Mockito.mock(ServiceRegistryIf.class));
|
bind(ServiceRegistryIf.class).toInstance(Mockito.mock(ServiceRegistryIf.class));
|
||||||
bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(ServiceId.Executor, 1, "", "", 0, UUID.randomUUID()));
|
bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(ServiceId.Index, 1, "", "", 0, UUID.randomUUID()));
|
||||||
bind(Integer.class).annotatedWith(Names.named("wmsa-system-node")).toInstance(1);
|
bind(Integer.class).annotatedWith(Names.named("wmsa-system-node")).toInstance(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -26,7 +26,9 @@ public class MathClient {
|
|||||||
private static final Logger logger = LoggerFactory.getLogger(MathClient.class);
|
private static final Logger logger = LoggerFactory.getLogger(MathClient.class);
|
||||||
|
|
||||||
private final GrpcSingleNodeChannelPool<MathApiGrpc.MathApiBlockingStub> channelPool;
|
private final GrpcSingleNodeChannelPool<MathApiGrpc.MathApiBlockingStub> channelPool;
|
||||||
private final ExecutorService executor = Executors.newWorkStealingPool(8);
|
|
||||||
|
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
|
||||||
|
private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newWorkStealingPool(8);
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public MathClient(GrpcChannelPoolFactory factory) {
|
public MathClient(GrpcChannelPoolFactory factory) {
|
||||||
|
43
code/functions/nsfw-domain-filter/build.gradle
Normal file
43
code/functions/nsfw-domain-filter/build.gradle
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
// Build script for the nsfw-domain-filter function module.
plugins {
    id 'java'
    id 'jvm-test-suite'
}

// Compile with the JVM version configured on the root project.
java {
    toolchain {
        languageVersion = JavaLanguageVersion.of(rootProject.ext.jvmVersion)
    }
}

apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {

    // Project modules
    implementation project(':code:common:config')
    implementation project(':code:common:model')
    implementation project(':code:common:db')

    // Third-party libraries
    implementation libs.bundles.slf4j
    implementation libs.prometheus
    implementation libs.guava
    implementation libs.commons.lang3
    // Guice is added with its own Guava dependency excluded
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
    implementation libs.notnull
    implementation libs.fastutil
    implementation libs.bundles.mariadb

    // Test dependencies
    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito

    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
    testImplementation libs.commons.codec
    testImplementation project(':code:common:service')
    testImplementation 'org.testcontainers:mariadb:1.17.4'
    testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
    testImplementation project(':code:libraries:test-helpers')
}
|
@@ -0,0 +1,192 @@
|
|||||||
|
package nu.marginalia.nsfw;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import com.google.inject.name.Named;
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.net.http.HttpClient;
|
||||||
|
import java.net.http.HttpRequest;
|
||||||
|
import java.net.http.HttpResponse;
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.zip.GZIPInputStream;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class NsfwDomainFilter {
|
||||||
|
private final HikariDataSource dataSource;
|
||||||
|
|
||||||
|
private final List<String> dangerLists;
|
||||||
|
private final List<String> smutLists;
|
||||||
|
|
||||||
|
private volatile IntOpenHashSet blockedDomainIdsTier1 = new IntOpenHashSet();
|
||||||
|
private volatile IntOpenHashSet blockedDomainIdsTier2 = new IntOpenHashSet();
|
||||||
|
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(NsfwDomainFilter.class);
|
||||||
|
|
||||||
|
public static final int NSFW_DISABLE = 0;
|
||||||
|
public static final int NSFW_BLOCK_DANGER = 1;
|
||||||
|
public static final int NSFW_BLOCK_SMUT = 2;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public NsfwDomainFilter(HikariDataSource dataSource,
|
||||||
|
@Named("nsfw.dangerLists") List<String> dangerLists,
|
||||||
|
@Named("nsfw.smutLists") List<String> smutLists
|
||||||
|
) {
|
||||||
|
this.dataSource = dataSource;
|
||||||
|
|
||||||
|
this.dangerLists = dangerLists;
|
||||||
|
this.smutLists = smutLists;
|
||||||
|
|
||||||
|
Thread.ofPlatform().daemon().name("NsfwDomainFilterSync").start(() -> {
|
||||||
|
while (true) {
|
||||||
|
sync();
|
||||||
|
try {
|
||||||
|
TimeUnit.HOURS.sleep(1);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
Thread.currentThread().interrupt();
|
||||||
|
break; // Exit the loop if interrupted
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isBlocked(int domainId, int tier) {
|
||||||
|
if (tier == 0)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (tier >= 1 && blockedDomainIdsTier1.contains(domainId))
|
||||||
|
return true;
|
||||||
|
if (tier >= 2 && blockedDomainIdsTier2.contains(domainId))
|
||||||
|
return true;
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private synchronized void sync() {
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.prepareStatement("SELECT ID, TIER FROM NSFW_DOMAINS")
|
||||||
|
) {
|
||||||
|
var rs = stmt.executeQuery();
|
||||||
|
IntOpenHashSet tier1 = new IntOpenHashSet();
|
||||||
|
IntOpenHashSet tier2 = new IntOpenHashSet();
|
||||||
|
|
||||||
|
while (rs.next()) {
|
||||||
|
int domainId = rs.getInt("ID");
|
||||||
|
int tier = rs.getInt("TIER");
|
||||||
|
|
||||||
|
switch (tier) {
|
||||||
|
case 1 -> tier1.add(domainId);
|
||||||
|
case 2 -> tier2.add(domainId);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
this.blockedDomainIdsTier1 = tier1;
|
||||||
|
this.blockedDomainIdsTier2 = tier2;
|
||||||
|
|
||||||
|
logger.info("NSFW domain filter synced: {} tier 1, {} tier 2", tier1.size(), tier2.size());
|
||||||
|
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
logger.error("Failed to sync NSFW domain filter", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public synchronized void fetchLists() {
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
HttpClient client = HttpClient.newBuilder()
|
||||||
|
.followRedirects(HttpClient.Redirect.ALWAYS)
|
||||||
|
.build();
|
||||||
|
var stmt = conn.createStatement();
|
||||||
|
var insertStmt = conn.prepareStatement("INSERT IGNORE INTO NSFW_DOMAINS_TMP (ID, TIER) SELECT ID, ? FROM EC_DOMAIN WHERE DOMAIN_NAME = ?")) {
|
||||||
|
|
||||||
|
stmt.execute("DROP TABLE IF EXISTS NSFW_DOMAINS_TMP");
|
||||||
|
stmt.execute("CREATE TABLE NSFW_DOMAINS_TMP LIKE NSFW_DOMAINS");
|
||||||
|
|
||||||
|
List<String> combinedDangerList = new ArrayList<>(10_000);
|
||||||
|
for (var dangerListUrl : dangerLists) {
|
||||||
|
combinedDangerList.addAll(fetchList(client, dangerListUrl));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (String domain : combinedDangerList) {
|
||||||
|
insertStmt.setInt(1, NSFW_BLOCK_DANGER);
|
||||||
|
insertStmt.setString(2, domain);
|
||||||
|
insertStmt.execute();
|
||||||
|
}
|
||||||
|
|
||||||
|
List<String> combinedSmutList = new ArrayList<>(10_000);
|
||||||
|
for (var smutListUrl : smutLists) {
|
||||||
|
combinedSmutList.addAll(fetchList(client, smutListUrl));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (String domain : combinedSmutList) {
|
||||||
|
insertStmt.setInt(1, NSFW_BLOCK_SMUT);
|
||||||
|
insertStmt.setString(2, domain);
|
||||||
|
insertStmt.addBatch();
|
||||||
|
insertStmt.execute();
|
||||||
|
}
|
||||||
|
|
||||||
|
stmt.execute("""
|
||||||
|
DROP TABLE IF EXISTS NSFW_DOMAINS
|
||||||
|
""");
|
||||||
|
stmt.execute("""
|
||||||
|
RENAME TABLE NSFW_DOMAINS_TMP TO NSFW_DOMAINS
|
||||||
|
""");
|
||||||
|
sync();
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
logger.error("Failed to fetch NSFW domain lists", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> fetchList(HttpClient client, String url) {
|
||||||
|
|
||||||
|
logger.info("Fetching NSFW domain list from {}", url);
|
||||||
|
|
||||||
|
var request = HttpRequest.newBuilder()
|
||||||
|
.uri(java.net.URI.create(url))
|
||||||
|
.build();
|
||||||
|
|
||||||
|
try {
|
||||||
|
if (url.endsWith(".gz")) {
|
||||||
|
var response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
|
||||||
|
|
||||||
|
byte[] body = response.body();
|
||||||
|
|
||||||
|
try (var reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(new ByteArrayInputStream(body))))) {
|
||||||
|
return reader.lines()
|
||||||
|
.filter(StringUtils::isNotEmpty)
|
||||||
|
.toList();
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("Error reading GZIP response from {}", url, e);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
var response = client.send(request, HttpResponse.BodyHandlers.ofString());
|
||||||
|
if (response.statusCode() == 200) {
|
||||||
|
|
||||||
|
return Arrays.stream(StringUtils.split(response.body(), "\n"))
|
||||||
|
.filter(StringUtils::isNotEmpty)
|
||||||
|
.toList();
|
||||||
|
} else {
|
||||||
|
logger.warn("Failed to fetch list from {}: HTTP {}", url, response.statusCode());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (Exception e) {
|
||||||
|
logger.error("Error fetching NSFW domain list from {}", url, e);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
return List.of();
|
||||||
|
}
|
||||||
|
}
|
@@ -0,0 +1,30 @@
|
|||||||
|
package nu.marginalia.nsfw;
|
||||||
|
|
||||||
|
import com.google.inject.AbstractModule;
|
||||||
|
import com.google.inject.Provides;
|
||||||
|
import jakarta.inject.Named;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public class NsfwFilterModule extends AbstractModule {
|
||||||
|
|
||||||
|
@Provides
|
||||||
|
@Named("nsfw.dangerLists")
|
||||||
|
public List<String> nsfwDomainLists1() {
|
||||||
|
return List.of(
|
||||||
|
"https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/cryptojacking/domains",
|
||||||
|
"https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/malware/domains",
|
||||||
|
"https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/phishing/domains"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
@Provides
|
||||||
|
@Named("nsfw.smutLists")
|
||||||
|
public List<String> nsfwDomainLists2() {
|
||||||
|
return List.of(
|
||||||
|
"https://github.com/olbat/ut1-blacklists/raw/refs/heads/master/blacklists/adult/domains.gz",
|
||||||
|
"https://raw.githubusercontent.com/olbat/ut1-blacklists/refs/heads/master/blacklists/gambling/domains"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void configure() {}
|
||||||
|
}
|
@@ -0,0 +1,108 @@
|
|||||||
|
package nu.marginalia.nsfw;
|
||||||
|
|
||||||
|
|
||||||
|
import com.google.inject.AbstractModule;
|
||||||
|
import com.google.inject.Guice;
|
||||||
|
import com.google.inject.Provides;
|
||||||
|
import com.zaxxer.hikari.HikariConfig;
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import jakarta.inject.Named;
|
||||||
|
import nu.marginalia.test.TestMigrationLoader;
|
||||||
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
import org.junit.jupiter.api.Tag;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.testcontainers.containers.MariaDBContainer;
|
||||||
|
import org.testcontainers.junit.jupiter.Container;
|
||||||
|
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
|
|
||||||
|
@Tag("slow")
|
||||||
|
@Testcontainers
|
||||||
|
class NsfwDomainFilterTest extends AbstractModule {
|
||||||
|
|
||||||
|
@Container
|
||||||
|
static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
|
||||||
|
.withDatabaseName("WMSA_prod")
|
||||||
|
.withUsername("wmsa")
|
||||||
|
.withPassword("wmsa")
|
||||||
|
.withNetworkAliases("mariadb");
|
||||||
|
|
||||||
|
static HikariDataSource dataSource;
|
||||||
|
static Path tempDir;
|
||||||
|
|
||||||
|
@BeforeAll
|
||||||
|
public static void setUpDb() throws IOException {
|
||||||
|
tempDir = Files.createTempDirectory(NsfwDomainFilterTest.class.getSimpleName());
|
||||||
|
|
||||||
|
System.setProperty("system.homePath", tempDir.toString());
|
||||||
|
|
||||||
|
HikariConfig config = new HikariConfig();
|
||||||
|
config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
|
||||||
|
config.setUsername("wmsa");
|
||||||
|
config.setPassword("wmsa");
|
||||||
|
|
||||||
|
dataSource = new HikariDataSource(config);
|
||||||
|
|
||||||
|
TestMigrationLoader.flywayMigration(dataSource);
|
||||||
|
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.prepareStatement("INSERT INTO EC_DOMAIN (DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) VALUES (?, ?, 1)")
|
||||||
|
) {
|
||||||
|
|
||||||
|
// Ensure the database is ready
|
||||||
|
conn.createStatement().execute("SELECT 1");
|
||||||
|
|
||||||
|
stmt.setString(1, "www.google.com");
|
||||||
|
stmt.setString(2, "google.com");
|
||||||
|
stmt.executeUpdate();
|
||||||
|
stmt.setString(1, "www.bing.com");
|
||||||
|
stmt.setString(2, "bing.com");
|
||||||
|
stmt.executeUpdate();
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new RuntimeException("Failed to connect to the database", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Provides
|
||||||
|
@Named("nsfw.dangerLists")
|
||||||
|
public List<String> nsfwDomainLists1() {
|
||||||
|
return List.of(
|
||||||
|
"https://downloads.marginalia.nu/test/list1"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Provides
|
||||||
|
@Named("nsfw.smutLists")
|
||||||
|
public List<String> nsfwDomainLists2() {
|
||||||
|
return List.of(
|
||||||
|
"https://downloads.marginalia.nu/test/list2.gz"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void configure() {
|
||||||
|
bind(HikariDataSource.class).toInstance(dataSource);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test() {
|
||||||
|
var filter = Guice
|
||||||
|
.createInjector(this)
|
||||||
|
.getInstance(NsfwDomainFilter.class);
|
||||||
|
|
||||||
|
filter.fetchLists();
|
||||||
|
|
||||||
|
assertTrue(filter.isBlocked(1, NsfwDomainFilter.NSFW_BLOCK_DANGER));
|
||||||
|
assertTrue(filter.isBlocked(1, NsfwDomainFilter.NSFW_BLOCK_SMUT));
|
||||||
|
assertFalse(filter.isBlocked(2, NsfwDomainFilter.NSFW_BLOCK_DANGER));
|
||||||
|
assertTrue(filter.isBlocked(2, NsfwDomainFilter.NSFW_BLOCK_SMUT));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@@ -1,9 +1,6 @@
|
|||||||
package nu.marginalia.api.searchquery;
|
package nu.marginalia.api.searchquery;
|
||||||
|
|
||||||
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
|
import nu.marginalia.api.searchquery.model.query.*;
|
||||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
|
||||||
import nu.marginalia.api.searchquery.model.query.QueryResponse;
|
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
|
||||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||||
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
|
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
|
||||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||||
@@ -32,6 +29,8 @@ public class QueryProtobufCodec {
|
|||||||
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
|
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
|
||||||
builder.setHumanQuery(request.getHumanQuery());
|
builder.setHumanQuery(request.getHumanQuery());
|
||||||
|
|
||||||
|
builder.setNsfwFilterTierValue(request.getNsfwFilterTierValue());
|
||||||
|
|
||||||
builder.setQuality(IndexProtobufCodec.convertSpecLimit(query.specs.quality));
|
builder.setQuality(IndexProtobufCodec.convertSpecLimit(query.specs.quality));
|
||||||
builder.setYear(IndexProtobufCodec.convertSpecLimit(query.specs.year));
|
builder.setYear(IndexProtobufCodec.convertSpecLimit(query.specs.year));
|
||||||
builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size));
|
builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size));
|
||||||
@@ -78,6 +77,8 @@ public class QueryProtobufCodec {
|
|||||||
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
|
builder.setSearchSetIdentifier(query.specs.searchSetIdentifier);
|
||||||
builder.setHumanQuery(humanQuery);
|
builder.setHumanQuery(humanQuery);
|
||||||
|
|
||||||
|
builder.setNsfwFilterTier(RpcIndexQuery.NSFW_FILTER_TIER.DANGER);
|
||||||
|
|
||||||
builder.setQuality(IndexProtobufCodec.convertSpecLimit(query.specs.quality));
|
builder.setQuality(IndexProtobufCodec.convertSpecLimit(query.specs.quality));
|
||||||
builder.setYear(IndexProtobufCodec.convertSpecLimit(query.specs.year));
|
builder.setYear(IndexProtobufCodec.convertSpecLimit(query.specs.year));
|
||||||
builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size));
|
builder.setSize(IndexProtobufCodec.convertSpecLimit(query.specs.size));
|
||||||
@@ -112,6 +113,7 @@ public class QueryProtobufCodec {
|
|||||||
request.getSearchSetIdentifier(),
|
request.getSearchSetIdentifier(),
|
||||||
QueryStrategy.valueOf(request.getQueryStrategy()),
|
QueryStrategy.valueOf(request.getQueryStrategy()),
|
||||||
RpcTemporalBias.Bias.valueOf(request.getTemporalBias().getBias().name()),
|
RpcTemporalBias.Bias.valueOf(request.getTemporalBias().getBias().name()),
|
||||||
|
NsfwFilterTier.fromCodedValue(request.getNsfwFilterTierValue()),
|
||||||
request.getPagination().getPage()
|
request.getPagination().getPage()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -302,7 +304,6 @@ public class QueryProtobufCodec {
|
|||||||
IndexProtobufCodec.convertRpcQuery(specs.getQuery()),
|
IndexProtobufCodec.convertRpcQuery(specs.getQuery()),
|
||||||
specs.getDomainsList(),
|
specs.getDomainsList(),
|
||||||
specs.getSearchSetIdentifier(),
|
specs.getSearchSetIdentifier(),
|
||||||
specs.getHumanQuery(),
|
|
||||||
IndexProtobufCodec.convertSpecLimit(specs.getQuality()),
|
IndexProtobufCodec.convertSpecLimit(specs.getQuality()),
|
||||||
IndexProtobufCodec.convertSpecLimit(specs.getYear()),
|
IndexProtobufCodec.convertSpecLimit(specs.getYear()),
|
||||||
IndexProtobufCodec.convertSpecLimit(specs.getSize()),
|
IndexProtobufCodec.convertSpecLimit(specs.getSize()),
|
||||||
@@ -327,6 +328,7 @@ public class QueryProtobufCodec {
|
|||||||
.setRank(IndexProtobufCodec.convertSpecLimit(params.rank()))
|
.setRank(IndexProtobufCodec.convertSpecLimit(params.rank()))
|
||||||
.setSearchSetIdentifier(params.identifier())
|
.setSearchSetIdentifier(params.identifier())
|
||||||
.setQueryStrategy(params.queryStrategy().name())
|
.setQueryStrategy(params.queryStrategy().name())
|
||||||
|
.setNsfwFilterTierValue(params.filterTier().getCodedValue())
|
||||||
.setTemporalBias(RpcTemporalBias.newBuilder()
|
.setTemporalBias(RpcTemporalBias.newBuilder()
|
||||||
.setBias(RpcTemporalBias.Bias.valueOf(params.temporalBias().name()))
|
.setBias(RpcTemporalBias.Bias.valueOf(params.temporalBias().name()))
|
||||||
.build())
|
.build())
|
||||||
|
@@ -0,0 +1,26 @@
|
|||||||
|
package nu.marginalia.api.searchquery.model.query;
|
||||||
|
|
||||||
|
/**
 * Level of NSFW filtering requested for a query.
 *
 * <p>Each tier carries an integer coded value; {@link #fromCodedValue(int)} and
 * {@link #getCodedValue()} convert between the enum and that integer.
 */
public enum NsfwFilterTier {
    OFF(0),
    DANGER(1),
    PORN_AND_GAMBLING(2);

    private final int codedValue; // same as ordinal() for now, but can be changed later if needed

    NsfwFilterTier(int codedValue) {
        this.codedValue = codedValue;
    }

    /**
     * Looks up the tier with the given coded value.
     *
     * @param codedValue integer representation of the tier
     * @return the matching tier
     * @throws IllegalArgumentException if no tier has that coded value
     */
    public static NsfwFilterTier fromCodedValue(int codedValue) {
        for (NsfwFilterTier tier : NsfwFilterTier.values()) {
            if (tier.codedValue == codedValue) {
                return tier;
            }
        }
        throw new IllegalArgumentException("Invalid coded value for NsfwFilterTier: " + codedValue);
    }

    /** @return the integer representation of this tier */
    public int getCodedValue() {
        return codedValue;
    }
}
|
@@ -25,10 +25,11 @@ public record QueryParams(
|
|||||||
String identifier,
|
String identifier,
|
||||||
QueryStrategy queryStrategy,
|
QueryStrategy queryStrategy,
|
||||||
RpcTemporalBias.Bias temporalBias,
|
RpcTemporalBias.Bias temporalBias,
|
||||||
|
NsfwFilterTier filterTier,
|
||||||
int page
|
int page
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
public QueryParams(String query, RpcQueryLimits limits, String identifier) {
|
public QueryParams(String query, RpcQueryLimits limits, String identifier, NsfwFilterTier filterTier) {
|
||||||
this(query, null,
|
this(query, null,
|
||||||
List.of(),
|
List.of(),
|
||||||
List.of(),
|
List.of(),
|
||||||
@@ -43,6 +44,7 @@ public record QueryParams(
|
|||||||
identifier,
|
identifier,
|
||||||
QueryStrategy.AUTO,
|
QueryStrategy.AUTO,
|
||||||
RpcTemporalBias.Bias.NONE,
|
RpcTemporalBias.Bias.NONE,
|
||||||
|
filterTier,
|
||||||
1 // page
|
1 // page
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@@ -18,8 +18,6 @@ public class SearchSpecification {
|
|||||||
|
|
||||||
public String searchSetIdentifier;
|
public String searchSetIdentifier;
|
||||||
|
|
||||||
public final String humanQuery;
|
|
||||||
|
|
||||||
public SpecificationLimit quality;
|
public SpecificationLimit quality;
|
||||||
public SpecificationLimit year;
|
public SpecificationLimit year;
|
||||||
public SpecificationLimit size;
|
public SpecificationLimit size;
|
||||||
@@ -35,7 +33,6 @@ public class SearchSpecification {
|
|||||||
public SearchSpecification(SearchQuery query,
|
public SearchSpecification(SearchQuery query,
|
||||||
List<Integer> domains,
|
List<Integer> domains,
|
||||||
String searchSetIdentifier,
|
String searchSetIdentifier,
|
||||||
String humanQuery,
|
|
||||||
SpecificationLimit quality,
|
SpecificationLimit quality,
|
||||||
SpecificationLimit year,
|
SpecificationLimit year,
|
||||||
SpecificationLimit size,
|
SpecificationLimit size,
|
||||||
@@ -47,7 +44,6 @@ public class SearchSpecification {
|
|||||||
this.query = query;
|
this.query = query;
|
||||||
this.domains = domains;
|
this.domains = domains;
|
||||||
this.searchSetIdentifier = searchSetIdentifier;
|
this.searchSetIdentifier = searchSetIdentifier;
|
||||||
this.humanQuery = humanQuery;
|
|
||||||
this.quality = quality;
|
this.quality = quality;
|
||||||
this.year = year;
|
this.year = year;
|
||||||
this.size = size;
|
this.size = size;
|
||||||
@@ -73,10 +69,6 @@ public class SearchSpecification {
|
|||||||
return this.searchSetIdentifier;
|
return this.searchSetIdentifier;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getHumanQuery() {
|
|
||||||
return this.humanQuery;
|
|
||||||
}
|
|
||||||
|
|
||||||
public SpecificationLimit getQuality() {
|
public SpecificationLimit getQuality() {
|
||||||
return this.quality;
|
return this.quality;
|
||||||
}
|
}
|
||||||
@@ -106,14 +98,13 @@ public class SearchSpecification {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "SearchSpecification(query=" + this.getQuery() + ", domains=" + this.getDomains() + ", searchSetIdentifier=" + this.getSearchSetIdentifier() + ", humanQuery=" + this.getHumanQuery() + ", quality=" + this.getQuality() + ", year=" + this.getYear() + ", size=" + this.getSize() + ", rank=" + this.getRank() + ", queryLimits=" + this.getQueryLimits() + ", queryStrategy=" + this.getQueryStrategy() + ", rankingParams=" + this.getRankingParams() + ")";
|
return "SearchSpecification(query=" + this.getQuery() + ", domains=" + this.getDomains() + ", searchSetIdentifier=" + this.getSearchSetIdentifier() + ", quality=" + this.getQuality() + ", year=" + this.getYear() + ", size=" + this.getSize() + ", rank=" + this.getRank() + ", queryLimits=" + this.getQueryLimits() + ", queryStrategy=" + this.getQueryStrategy() + ", rankingParams=" + this.getRankingParams() + ")";
|
||||||
}
|
}
|
||||||
|
|
||||||
public static class SearchSpecificationBuilder {
|
public static class SearchSpecificationBuilder {
|
||||||
private SearchQuery query;
|
private SearchQuery query;
|
||||||
private List<Integer> domains;
|
private List<Integer> domains;
|
||||||
private String searchSetIdentifier;
|
private String searchSetIdentifier;
|
||||||
private String humanQuery;
|
|
||||||
private SpecificationLimit quality$value;
|
private SpecificationLimit quality$value;
|
||||||
private boolean quality$set;
|
private boolean quality$set;
|
||||||
private SpecificationLimit year$value;
|
private SpecificationLimit year$value;
|
||||||
@@ -144,11 +135,6 @@ public class SearchSpecification {
|
|||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public SearchSpecificationBuilder humanQuery(String humanQuery) {
|
|
||||||
this.humanQuery = humanQuery;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public SearchSpecificationBuilder quality(SpecificationLimit quality) {
|
public SearchSpecificationBuilder quality(SpecificationLimit quality) {
|
||||||
this.quality$value = quality;
|
this.quality$value = quality;
|
||||||
this.quality$set = true;
|
this.quality$set = true;
|
||||||
@@ -205,11 +191,7 @@ public class SearchSpecification {
|
|||||||
if (!this.rank$set) {
|
if (!this.rank$set) {
|
||||||
rank$value = SpecificationLimit.none();
|
rank$value = SpecificationLimit.none();
|
||||||
}
|
}
|
||||||
return new SearchSpecification(this.query, this.domains, this.searchSetIdentifier, this.humanQuery, quality$value, year$value, size$value, rank$value, this.queryLimits, this.queryStrategy, this.rankingParams);
|
return new SearchSpecification(this.query, this.domains, this.searchSetIdentifier, quality$value, year$value, size$value, rank$value, this.queryLimits, this.queryStrategy, this.rankingParams);
|
||||||
}
|
|
||||||
|
|
||||||
public String toString() {
|
|
||||||
return "SearchSpecification.SearchSpecificationBuilder(query=" + this.query + ", domains=" + this.domains + ", searchSetIdentifier=" + this.searchSetIdentifier + ", humanQuery=" + this.humanQuery + ", quality$value=" + this.quality$value + ", year$value=" + this.year$value + ", size$value=" + this.size$value + ", rank$value=" + this.rank$value + ", queryLimits=" + this.queryLimits + ", queryStrategy=" + this.queryStrategy + ", rankingParams=" + this.rankingParams + ")";
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -1,56 +0,0 @@
|
|||||||
package nu.marginalia.api.searchquery.model.results;
|
|
||||||
|
|
||||||
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
|
|
||||||
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
|
|
||||||
|
|
||||||
import java.util.BitSet;
|
|
||||||
|
|
||||||
public class ResultRankingContext {
|
|
||||||
private final int docCount;
|
|
||||||
public final RpcResultRankingParameters params;
|
|
||||||
|
|
||||||
|
|
||||||
public final BitSet regularMask;
|
|
||||||
public final BitSet ngramsMask;
|
|
||||||
|
|
||||||
/** CqDataInt associated with frequency information of the terms in the query
|
|
||||||
* in the full index. The dataset is indexed by the compiled query. */
|
|
||||||
public final CqDataInt fullCounts;
|
|
||||||
|
|
||||||
/** CqDataInt associated with frequency information of the terms in the query
|
|
||||||
* in the full index. The dataset is indexed by the compiled query. */
|
|
||||||
public final CqDataInt priorityCounts;
|
|
||||||
|
|
||||||
public ResultRankingContext(int docCount,
|
|
||||||
RpcResultRankingParameters params,
|
|
||||||
BitSet ngramsMask,
|
|
||||||
BitSet regularMask,
|
|
||||||
CqDataInt fullCounts,
|
|
||||||
CqDataInt prioCounts)
|
|
||||||
{
|
|
||||||
this.docCount = docCount;
|
|
||||||
this.params = params;
|
|
||||||
|
|
||||||
this.ngramsMask = ngramsMask;
|
|
||||||
this.regularMask = regularMask;
|
|
||||||
|
|
||||||
this.fullCounts = fullCounts;
|
|
||||||
this.priorityCounts = prioCounts;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int termFreqDocCount() {
|
|
||||||
return docCount;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
return "ResultRankingContext{" +
|
|
||||||
"docCount=" + docCount +
|
|
||||||
", params=" + params +
|
|
||||||
", regularMask=" + regularMask +
|
|
||||||
", ngramsMask=" + ngramsMask +
|
|
||||||
", fullCounts=" + fullCounts +
|
|
||||||
", priorityCounts=" + priorityCounts +
|
|
||||||
'}';
|
|
||||||
}
|
|
||||||
}
|
|
@@ -32,6 +32,14 @@ message RpcQsQuery {
|
|||||||
RpcTemporalBias temporalBias = 16;
|
RpcTemporalBias temporalBias = 16;
|
||||||
|
|
||||||
RpcQsQueryPagination pagination = 17;
|
RpcQsQueryPagination pagination = 17;
|
||||||
|
|
||||||
|
NSFW_FILTER_TIER nsfwFilterTier = 18;
|
||||||
|
|
||||||
|
enum NSFW_FILTER_TIER {
|
||||||
|
NONE = 0;
|
||||||
|
DANGER = 1;
|
||||||
|
PORN_AND_GAMBLING = 2;
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Query service query response */
|
/* Query service query response */
|
||||||
@@ -78,8 +86,17 @@ message RpcIndexQuery {
|
|||||||
RpcQueryLimits queryLimits = 10;
|
RpcQueryLimits queryLimits = 10;
|
||||||
string queryStrategy = 11; // Named query configuration
|
string queryStrategy = 11; // Named query configuration
|
||||||
RpcResultRankingParameters parameters = 12;
|
RpcResultRankingParameters parameters = 12;
|
||||||
|
|
||||||
|
NSFW_FILTER_TIER nsfwFilterTier = 13;
|
||||||
|
|
||||||
|
enum NSFW_FILTER_TIER {
|
||||||
|
NONE = 0;
|
||||||
|
DANGER = 1;
|
||||||
|
PORN_AND_GAMBLING = 2;
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* A tagged union encoding some limit on a field */
|
/* A tagged union encoding some limit on a field */
|
||||||
message RpcSpecLimit {
|
message RpcSpecLimit {
|
||||||
int32 value = 1;
|
int32 value = 1;
|
||||||
|
@@ -19,6 +19,7 @@ dependencies {
|
|||||||
implementation project(':code:common:model')
|
implementation project(':code:common:model')
|
||||||
implementation project(':code:common:service')
|
implementation project(':code:common:service')
|
||||||
|
|
||||||
|
implementation project(':code:functions:nsfw-domain-filter')
|
||||||
implementation project(':code:functions:search-query:api')
|
implementation project(':code:functions:search-query:api')
|
||||||
|
|
||||||
implementation project(':code:index:query')
|
implementation project(':code:index:query')
|
||||||
|
@@ -34,8 +34,6 @@ public class QueryFactory {
|
|||||||
this.queryExpansion = queryExpansion;
|
this.queryExpansion = queryExpansion;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public ProcessedQuery createQuery(QueryParams params,
|
public ProcessedQuery createQuery(QueryParams params,
|
||||||
@Nullable RpcResultRankingParameters rankingParams) {
|
@Nullable RpcResultRankingParameters rankingParams) {
|
||||||
final var query = params.humanQuery();
|
final var query = params.humanQuery();
|
||||||
@@ -153,7 +151,6 @@ public class QueryFactory {
|
|||||||
|
|
||||||
var specsBuilder = SearchSpecification.builder()
|
var specsBuilder = SearchSpecification.builder()
|
||||||
.query(queryBuilder.build())
|
.query(queryBuilder.build())
|
||||||
.humanQuery(query)
|
|
||||||
.quality(qualityLimit)
|
.quality(qualityLimit)
|
||||||
.year(year)
|
.year(year)
|
||||||
.size(size)
|
.size(size)
|
||||||
|
@@ -3,6 +3,7 @@ package nu.marginalia.functions.searchquery;
|
|||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
|
import io.grpc.Status;
|
||||||
import io.grpc.stub.StreamObserver;
|
import io.grpc.stub.StreamObserver;
|
||||||
import io.prometheus.client.Histogram;
|
import io.prometheus.client.Histogram;
|
||||||
import nu.marginalia.api.searchquery.*;
|
import nu.marginalia.api.searchquery.*;
|
||||||
@@ -11,6 +12,7 @@ import nu.marginalia.api.searchquery.model.query.QueryParams;
|
|||||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||||
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
|
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
|
||||||
import nu.marginalia.index.api.IndexClient;
|
import nu.marginalia.index.api.IndexClient;
|
||||||
|
import nu.marginalia.nsfw.NsfwDomainFilter;
|
||||||
import nu.marginalia.service.server.DiscoverableService;
|
import nu.marginalia.service.server.DiscoverableService;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
@@ -34,13 +36,16 @@ public class QueryGRPCService
|
|||||||
|
|
||||||
|
|
||||||
private final QueryFactory queryFactory;
|
private final QueryFactory queryFactory;
|
||||||
|
private final NsfwDomainFilter nsfwDomainFilter;
|
||||||
private final IndexClient indexClient;
|
private final IndexClient indexClient;
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public QueryGRPCService(QueryFactory queryFactory,
|
public QueryGRPCService(QueryFactory queryFactory,
|
||||||
|
NsfwDomainFilter nsfwDomainFilter,
|
||||||
IndexClient indexClient)
|
IndexClient indexClient)
|
||||||
{
|
{
|
||||||
this.queryFactory = queryFactory;
|
this.queryFactory = queryFactory;
|
||||||
|
this.nsfwDomainFilter = nsfwDomainFilter;
|
||||||
this.indexClient = indexClient;
|
this.indexClient = indexClient;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -89,7 +94,7 @@ public class QueryGRPCService
|
|||||||
});
|
});
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
logger.error("Exception", e);
|
logger.error("Exception", e);
|
||||||
responseObserver.onError(e);
|
responseObserver.onError(Status.INTERNAL.withCause(e).asRuntimeException());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -3,6 +3,7 @@ package nu.marginalia.query.svc;
|
|||||||
import nu.marginalia.WmsaHome;
|
import nu.marginalia.WmsaHome;
|
||||||
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||||
import nu.marginalia.api.searchquery.RpcTemporalBias;
|
import nu.marginalia.api.searchquery.RpcTemporalBias;
|
||||||
|
import nu.marginalia.api.searchquery.model.query.NsfwFilterTier;
|
||||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||||
import nu.marginalia.functions.searchquery.QueryFactory;
|
import nu.marginalia.functions.searchquery.QueryFactory;
|
||||||
@@ -58,6 +59,7 @@ public class QueryFactoryTest {
|
|||||||
"NONE",
|
"NONE",
|
||||||
QueryStrategy.AUTO,
|
QueryStrategy.AUTO,
|
||||||
RpcTemporalBias.Bias.NONE,
|
RpcTemporalBias.Bias.NONE,
|
||||||
|
NsfwFilterTier.OFF,
|
||||||
0), null).specs;
|
0), null).specs;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -239,7 +241,6 @@ public class QueryFactoryTest {
|
|||||||
|
|
||||||
Assertions.assertTrue(subquery.query.compiledQuery.contains(" bob "));
|
Assertions.assertTrue(subquery.query.compiledQuery.contains(" bob "));
|
||||||
Assertions.assertFalse(subquery.query.compiledQuery.contains(" bob's "));
|
Assertions.assertFalse(subquery.query.compiledQuery.contains(" bob's "));
|
||||||
Assertions.assertEquals("\"bob's cars\"", subquery.humanQuery);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@@ -17,6 +17,7 @@ dependencies {
|
|||||||
implementation project(':code:common:service')
|
implementation project(':code:common:service')
|
||||||
implementation project(':code:common:db')
|
implementation project(':code:common:db')
|
||||||
implementation project(':code:libraries:message-queue')
|
implementation project(':code:libraries:message-queue')
|
||||||
|
implementation project(':code:functions:nsfw-domain-filter')
|
||||||
implementation project(':code:functions:search-query:api')
|
implementation project(':code:functions:search-query:api')
|
||||||
|
|
||||||
implementation libs.bundles.slf4j
|
implementation libs.bundles.slf4j
|
||||||
|
@@ -2,11 +2,13 @@ package nu.marginalia.index.api;
|
|||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
|
import io.prometheus.client.Counter;
|
||||||
import nu.marginalia.api.searchquery.IndexApiGrpc;
|
import nu.marginalia.api.searchquery.IndexApiGrpc;
|
||||||
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
|
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
|
||||||
import nu.marginalia.api.searchquery.RpcIndexQuery;
|
import nu.marginalia.api.searchquery.RpcIndexQuery;
|
||||||
import nu.marginalia.db.DomainBlacklistImpl;
|
import nu.marginalia.db.DomainBlacklistImpl;
|
||||||
import nu.marginalia.model.id.UrlIdCodec;
|
import nu.marginalia.model.id.UrlIdCodec;
|
||||||
|
import nu.marginalia.nsfw.NsfwDomainFilter;
|
||||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||||
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
|
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
|
||||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||||
@@ -28,14 +30,28 @@ public class IndexClient {
|
|||||||
private static final Logger logger = LoggerFactory.getLogger(IndexClient.class);
|
private static final Logger logger = LoggerFactory.getLogger(IndexClient.class);
|
||||||
private final GrpcMultiNodeChannelPool<IndexApiGrpc.IndexApiBlockingStub> channelPool;
|
private final GrpcMultiNodeChannelPool<IndexApiGrpc.IndexApiBlockingStub> channelPool;
|
||||||
private final DomainBlacklistImpl blacklist;
|
private final DomainBlacklistImpl blacklist;
|
||||||
private static final ExecutorService executor = Executors.newCachedThreadPool();
|
private final NsfwDomainFilter nsfwDomainFilter;
|
||||||
|
|
||||||
|
Counter wmsa_index_query_count = Counter.build()
|
||||||
|
.name("wmsa_nsfw_filter_result_count")
|
||||||
|
.labelNames("tier")
|
||||||
|
.help("Count of results filtered by NSFW tier")
|
||||||
|
.register();
|
||||||
|
|
||||||
|
|
||||||
|
private static final boolean useLoom = Boolean.getBoolean("system.experimentalUseLoom");
|
||||||
|
private static final ExecutorService executor = useLoom ? Executors.newVirtualThreadPerTaskExecutor() : Executors.newCachedThreadPool();
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public IndexClient(GrpcChannelPoolFactory channelPoolFactory, DomainBlacklistImpl blacklist) {
|
public IndexClient(GrpcChannelPoolFactory channelPoolFactory,
|
||||||
|
DomainBlacklistImpl blacklist,
|
||||||
|
NsfwDomainFilter nsfwDomainFilter
|
||||||
|
) {
|
||||||
this.channelPool = channelPoolFactory.createMulti(
|
this.channelPool = channelPoolFactory.createMulti(
|
||||||
ServiceKey.forGrpcApi(IndexApiGrpc.class, ServicePartition.multi()),
|
ServiceKey.forGrpcApi(IndexApiGrpc.class, ServicePartition.multi()),
|
||||||
IndexApiGrpc::newBlockingStub);
|
IndexApiGrpc::newBlockingStub);
|
||||||
this.blacklist = blacklist;
|
this.blacklist = blacklist;
|
||||||
|
this.nsfwDomainFilter = nsfwDomainFilter;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final Comparator<RpcDecoratedResultItem> comparator =
|
private static final Comparator<RpcDecoratedResultItem> comparator =
|
||||||
@@ -52,7 +68,7 @@ public class IndexClient {
|
|||||||
public AggregateQueryResponse executeQueries(RpcIndexQuery indexRequest, Pagination pagination) {
|
public AggregateQueryResponse executeQueries(RpcIndexQuery indexRequest, Pagination pagination) {
|
||||||
|
|
||||||
final int requestedMaxResults = indexRequest.getQueryLimits().getResultsTotal();
|
final int requestedMaxResults = indexRequest.getQueryLimits().getResultsTotal();
|
||||||
|
int filterTier = indexRequest.getNsfwFilterTierValue();
|
||||||
AtomicInteger totalNumResults = new AtomicInteger(0);
|
AtomicInteger totalNumResults = new AtomicInteger(0);
|
||||||
|
|
||||||
List<RpcDecoratedResultItem> results =
|
List<RpcDecoratedResultItem> results =
|
||||||
@@ -74,7 +90,7 @@ public class IndexClient {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
.flatMap(List::stream)
|
.flatMap(List::stream)
|
||||||
.filter(item -> !isBlacklisted(item))
|
.filter(item -> !isBlacklisted(item, filterTier))
|
||||||
.sorted(comparator)
|
.sorted(comparator)
|
||||||
.skip(Math.max(0, (pagination.page - 1) * pagination.pageSize))
|
.skip(Math.max(0, (pagination.page - 1) * pagination.pageSize))
|
||||||
.limit(pagination.pageSize)
|
.limit(pagination.pageSize)
|
||||||
@@ -83,8 +99,23 @@ public class IndexClient {
|
|||||||
return new AggregateQueryResponse(results, pagination.page(), totalNumResults.get());
|
return new AggregateQueryResponse(results, pagination.page(), totalNumResults.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isBlacklisted(RpcDecoratedResultItem item) {
|
static String[] tierNames = {
|
||||||
return blacklist.isBlacklisted(UrlIdCodec.getDomainId(item.getRawItem().getCombinedId()));
|
"OFF",
|
||||||
|
"DANGER",
|
||||||
|
"NSFW"
|
||||||
|
};
|
||||||
|
|
||||||
|
private boolean isBlacklisted(RpcDecoratedResultItem item, int filterTier) {
|
||||||
|
int domainId = UrlIdCodec.getDomainId(item.getRawItem().getCombinedId());
|
||||||
|
|
||||||
|
if (blacklist.isBlacklisted(domainId)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (nsfwDomainFilter.isBlocked(domainId, filterTier)) {
|
||||||
|
wmsa_index_query_count.labels(tierNames[filterTier]).inc();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@@ -1,9 +1,10 @@
|
|||||||
package nu.marginalia.index.forward;
|
package nu.marginalia.index.forward;
|
||||||
|
|
||||||
|
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
|
||||||
import nu.marginalia.array.LongArray;
|
import nu.marginalia.array.LongArray;
|
||||||
import nu.marginalia.array.LongArrayFactory;
|
import nu.marginalia.array.LongArrayFactory;
|
||||||
import nu.marginalia.index.forward.spans.DocumentSpans;
|
import nu.marginalia.index.forward.spans.DocumentSpans;
|
||||||
import nu.marginalia.index.forward.spans.ForwardIndexSpansReader;
|
import nu.marginalia.index.forward.spans.IndexSpansReader;
|
||||||
import nu.marginalia.model.id.UrlIdCodec;
|
import nu.marginalia.model.id.UrlIdCodec;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
@@ -22,16 +23,15 @@ import static nu.marginalia.index.forward.ForwardIndexParameters.*;
|
|||||||
* and a mapping between document identifiers to the index into the
|
* and a mapping between document identifiers to the index into the
|
||||||
* data array.
|
* data array.
|
||||||
* <p/>
|
* <p/>
|
||||||
* Since the total data is relatively small, this is kept in memory to
|
|
||||||
* reduce the amount of disk thrashing.
|
|
||||||
* <p/>
|
|
||||||
* The metadata is a binary encoding of {@see nu.marginalia.idx.DocumentMetadata}
|
* The metadata is a binary encoding of {@see nu.marginalia.idx.DocumentMetadata}
|
||||||
*/
|
*/
|
||||||
public class ForwardIndexReader {
|
public class ForwardIndexReader {
|
||||||
private final LongArray ids;
|
private final LongArray ids;
|
||||||
private final LongArray data;
|
private final LongArray data;
|
||||||
|
|
||||||
private final ForwardIndexSpansReader spansReader;
|
private volatile Long2IntOpenHashMap idsMap;
|
||||||
|
|
||||||
|
private final IndexSpansReader spansReader;
|
||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
@@ -64,7 +64,18 @@ public class ForwardIndexReader {
|
|||||||
|
|
||||||
ids = loadIds(idsFile);
|
ids = loadIds(idsFile);
|
||||||
data = loadData(dataFile);
|
data = loadData(dataFile);
|
||||||
spansReader = new ForwardIndexSpansReader(spansFile);
|
|
||||||
|
spansReader = IndexSpansReader.open(spansFile);
|
||||||
|
|
||||||
|
Thread.ofPlatform().start(this::createIdsMap);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void createIdsMap() {
|
||||||
|
Long2IntOpenHashMap idsMap = new Long2IntOpenHashMap((int) ids.size());
|
||||||
|
for (int i = 0; i < ids.size(); i++) {
|
||||||
|
idsMap.put(ids.get(i), i);
|
||||||
|
}
|
||||||
|
this.idsMap = idsMap;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static LongArray loadIds(Path idsFile) throws IOException {
|
private static LongArray loadIds(Path idsFile) throws IOException {
|
||||||
@@ -106,6 +117,10 @@ public class ForwardIndexReader {
|
|||||||
private int idxForDoc(long docId) {
|
private int idxForDoc(long docId) {
|
||||||
assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
|
assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
|
||||||
|
|
||||||
|
if (idsMap != null) {
|
||||||
|
return idsMap.getOrDefault(docId, -1);
|
||||||
|
}
|
||||||
|
|
||||||
long offset = ids.binarySearch(docId, 0, ids.size());
|
long offset = ids.binarySearch(docId, 0, ids.size());
|
||||||
|
|
||||||
if (offset >= ids.size() || offset < 0 || ids.get(offset) != docId) {
|
if (offset >= ids.size() || offset < 0 || ids.get(offset) != docId) {
|
||||||
@@ -134,6 +149,27 @@ public class ForwardIndexReader {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public DocumentSpans[] getDocumentSpans(Arena arena, long[] docIds) {
|
||||||
|
long[] offsets = new long[docIds.length];
|
||||||
|
for (int i = 0; i < docIds.length; i++) {
|
||||||
|
long offset = idxForDoc(docIds[i]);
|
||||||
|
if (offset >= 0) {
|
||||||
|
offsets[i] = data.get(ENTRY_SIZE * offset + SPANS_OFFSET);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
offsets[i] = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
return spansReader.readSpans(arena, offsets);
|
||||||
|
}
|
||||||
|
catch (IOException ex) {
|
||||||
|
logger.error("Failed to read spans for docIds", ex);
|
||||||
|
return new DocumentSpans[docIds.length];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public int totalDocCount() {
|
public int totalDocCount() {
|
||||||
return (int) ids.size();
|
return (int) ids.size();
|
||||||
}
|
}
|
||||||
@@ -141,6 +177,8 @@ public class ForwardIndexReader {
|
|||||||
public void close() {
|
public void close() {
|
||||||
if (data != null)
|
if (data != null)
|
||||||
data.close();
|
data.close();
|
||||||
|
if (ids != null)
|
||||||
|
ids.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isLoaded() {
|
public boolean isLoaded() {
|
||||||
|
@@ -5,7 +5,7 @@ import nu.marginalia.array.LongArray;
|
|||||||
import nu.marginalia.array.LongArrayFactory;
|
import nu.marginalia.array.LongArrayFactory;
|
||||||
import nu.marginalia.index.domainrankings.DomainRankings;
|
import nu.marginalia.index.domainrankings.DomainRankings;
|
||||||
import nu.marginalia.index.forward.ForwardIndexParameters;
|
import nu.marginalia.index.forward.ForwardIndexParameters;
|
||||||
import nu.marginalia.index.forward.spans.ForwardIndexSpansWriter;
|
import nu.marginalia.index.forward.spans.IndexSpansWriter;
|
||||||
import nu.marginalia.index.journal.IndexJournal;
|
import nu.marginalia.index.journal.IndexJournal;
|
||||||
import nu.marginalia.model.id.UrlIdCodec;
|
import nu.marginalia.model.id.UrlIdCodec;
|
||||||
import nu.marginalia.model.idx.DocumentMetadata;
|
import nu.marginalia.model.idx.DocumentMetadata;
|
||||||
@@ -65,7 +65,7 @@ public class ForwardIndexConverter {
|
|||||||
logger.info("Domain Rankings size = {}", domainRankings.size());
|
logger.info("Domain Rankings size = {}", domainRankings.size());
|
||||||
|
|
||||||
try (var progress = heartbeat.createProcessTaskHeartbeat(TaskSteps.class, "forwardIndexConverter");
|
try (var progress = heartbeat.createProcessTaskHeartbeat(TaskSteps.class, "forwardIndexConverter");
|
||||||
var spansWriter = new ForwardIndexSpansWriter(outputFileSpansData)
|
var spansWriter = new IndexSpansWriter(outputFileSpansData)
|
||||||
) {
|
) {
|
||||||
progress.progress(TaskSteps.GET_DOC_IDS);
|
progress.progress(TaskSteps.GET_DOC_IDS);
|
||||||
|
|
||||||
|
@@ -11,6 +11,9 @@ public class DocumentSpan {
|
|||||||
/** A list of the interlaced start and end positions of each span in the document of this type */
|
/** A list of the interlaced start and end positions of each span in the document of this type */
|
||||||
private final IntList startsEnds;
|
private final IntList startsEnds;
|
||||||
|
|
||||||
|
public DocumentSpan(IntList startsEnds) {
|
||||||
|
this.startsEnds = startsEnds;
|
||||||
|
}
|
||||||
public DocumentSpan(CodedSequence startsEnds) {
|
public DocumentSpan(CodedSequence startsEnds) {
|
||||||
this.startsEnds = startsEnds.values();
|
this.startsEnds = startsEnds.values();
|
||||||
}
|
}
|
||||||
|
@@ -1,5 +1,6 @@
|
|||||||
package nu.marginalia.index.forward.spans;
|
package nu.marginalia.index.forward.spans;
|
||||||
|
|
||||||
|
import it.unimi.dsi.fastutil.ints.IntList;
|
||||||
import nu.marginalia.language.sentence.tag.HtmlTag;
|
import nu.marginalia.language.sentence.tag.HtmlTag;
|
||||||
import nu.marginalia.sequence.CodedSequence;
|
import nu.marginalia.sequence.CodedSequence;
|
||||||
|
|
||||||
@@ -39,6 +40,23 @@ public class DocumentSpans {
|
|||||||
return EMPTY_SPAN;
|
return EMPTY_SPAN;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void accept(byte code, IntList positions) {
|
||||||
|
if (code == HtmlTag.HEADING.code)
|
||||||
|
this.heading = new DocumentSpan(positions);
|
||||||
|
else if (code == HtmlTag.TITLE.code)
|
||||||
|
this.title = new DocumentSpan(positions);
|
||||||
|
else if (code == HtmlTag.NAV.code)
|
||||||
|
this.nav = new DocumentSpan(positions);
|
||||||
|
else if (code == HtmlTag.CODE.code)
|
||||||
|
this.code = new DocumentSpan(positions);
|
||||||
|
else if (code == HtmlTag.ANCHOR.code)
|
||||||
|
this.anchor = new DocumentSpan(positions);
|
||||||
|
else if (code == HtmlTag.EXTERNAL_LINKTEXT.code)
|
||||||
|
this.externalLinkText = new DocumentSpan(positions);
|
||||||
|
else if (code == HtmlTag.BODY.code)
|
||||||
|
this.body = new DocumentSpan(positions);
|
||||||
|
}
|
||||||
|
|
||||||
void accept(byte code, CodedSequence positions) {
|
void accept(byte code, CodedSequence positions) {
|
||||||
if (code == HtmlTag.HEADING.code)
|
if (code == HtmlTag.HEADING.code)
|
||||||
this.heading = new DocumentSpan(positions);
|
this.heading = new DocumentSpan(positions);
|
||||||
|
@@ -0,0 +1,25 @@
|
|||||||
|
package nu.marginalia.index.forward.spans;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.lang.foreign.Arena;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
public interface IndexSpansReader extends AutoCloseable {
|
||||||
|
DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException;
|
||||||
|
DocumentSpans[] readSpans(Arena arena, long[] encodedOffsets) throws IOException;
|
||||||
|
|
||||||
|
static IndexSpansReader open(Path fileName) throws IOException {
|
||||||
|
int version = SpansCodec.parseSpanFilesFooter(fileName);
|
||||||
|
if (version == SpansCodec.SpansCodecVersion.COMPRESSED.ordinal()) {
|
||||||
|
return new IndexSpansReaderCompressed(fileName);
|
||||||
|
}
|
||||||
|
else if (version == SpansCodec.SpansCodecVersion.PLAIN.ordinal()) {
|
||||||
|
return new IndexSpansReaderPlain(fileName);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
throw new IllegalArgumentException("Unsupported spans file version: " + version);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void close() throws IOException;
|
||||||
|
}
|
@@ -10,11 +10,11 @@ import java.nio.file.Files;
|
|||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.nio.file.StandardOpenOption;
|
import java.nio.file.StandardOpenOption;
|
||||||
|
|
||||||
@SuppressWarnings("preview")
|
@Deprecated
|
||||||
public class ForwardIndexSpansReader implements AutoCloseable {
|
public class IndexSpansReaderCompressed implements AutoCloseable, IndexSpansReader {
|
||||||
private final FileChannel spansFileChannel;
|
private final FileChannel spansFileChannel;
|
||||||
|
|
||||||
public ForwardIndexSpansReader(Path spansFile) throws IOException {
|
public IndexSpansReaderCompressed(Path spansFile) throws IOException {
|
||||||
this.spansFileChannel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ);
|
this.spansFileChannel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -51,6 +51,17 @@ public class ForwardIndexSpansReader implements AutoCloseable {
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DocumentSpans[] readSpans(Arena arena, long[] encodedOffsets) throws IOException {
|
||||||
|
DocumentSpans[] ret = new DocumentSpans[encodedOffsets.length];
|
||||||
|
for (int i = 0; i < encodedOffsets.length; i++) {
|
||||||
|
if (encodedOffsets[i] >= 0) {
|
||||||
|
ret[i] = readSpans(arena, encodedOffsets[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void close() throws IOException {
|
public void close() throws IOException {
|
||||||
spansFileChannel.close();
|
spansFileChannel.close();
|
@@ -0,0 +1,122 @@
|
|||||||
|
package nu.marginalia.index.forward.spans;
|
||||||
|
|
||||||
|
import it.unimi.dsi.fastutil.ints.IntArrayList;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.lang.foreign.Arena;
|
||||||
|
import java.lang.foreign.MemorySegment;
|
||||||
|
import java.lang.foreign.ValueLayout;
|
||||||
|
import java.nio.channels.FileChannel;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.StandardOpenOption;
|
||||||
|
import java.util.concurrent.CountDownLatch;
|
||||||
|
import java.util.concurrent.ForkJoinPool;
|
||||||
|
|
||||||
|
public class IndexSpansReaderPlain implements IndexSpansReader {
|
||||||
|
private final FileChannel[] spansFileChannels;
|
||||||
|
private final ForkJoinPool forkJoinPool;
|
||||||
|
|
||||||
|
public IndexSpansReaderPlain(Path spansFile) throws IOException {
|
||||||
|
this.spansFileChannels = new FileChannel[8];
|
||||||
|
for (int i = 0; i < spansFileChannels.length; i++) {
|
||||||
|
spansFileChannels[i] = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ);
|
||||||
|
}
|
||||||
|
forkJoinPool = new ForkJoinPool(spansFileChannels.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException {
|
||||||
|
// Decode the size and offset from the encoded offset
|
||||||
|
long size = SpansCodec.decodeSize(encodedOffset);
|
||||||
|
long offset = SpansCodec.decodeStartOffset(encodedOffset);
|
||||||
|
|
||||||
|
var ms = arena.allocate(size, 4);
|
||||||
|
// Allocate a buffer from the arena
|
||||||
|
var buffer = ms.asByteBuffer();
|
||||||
|
while (buffer.hasRemaining()) {
|
||||||
|
spansFileChannels[0].read(buffer, offset + buffer.position());
|
||||||
|
}
|
||||||
|
|
||||||
|
return decode(ms);
|
||||||
|
}
|
||||||
|
|
||||||
|
public DocumentSpans decode(MemorySegment ms) {
|
||||||
|
int count = ms.get(ValueLayout.JAVA_INT, 0);
|
||||||
|
int pos = 4;
|
||||||
|
DocumentSpans ret = new DocumentSpans();
|
||||||
|
|
||||||
|
// Decode each span
|
||||||
|
for (int spanIdx = 0; spanIdx < count; spanIdx++) {
|
||||||
|
byte code = ms.get(ValueLayout.JAVA_BYTE, pos);
|
||||||
|
short len = ms.get(ValueLayout.JAVA_SHORT, pos+2);
|
||||||
|
|
||||||
|
IntArrayList values = new IntArrayList(len);
|
||||||
|
|
||||||
|
pos += 4;
|
||||||
|
for (int i = 0; i < len; i++) {
|
||||||
|
values.add(ms.get(ValueLayout.JAVA_INT, pos + 4*i));
|
||||||
|
}
|
||||||
|
ret.accept(code, values);
|
||||||
|
pos += 4*len;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
@Override
|
||||||
|
public DocumentSpans[] readSpans(Arena arena, long[] encodedOffsets) throws IOException {
|
||||||
|
long totalSize = 0;
|
||||||
|
int numJobs = 0;
|
||||||
|
for (long offset : encodedOffsets) {
|
||||||
|
if (offset < 0)
|
||||||
|
continue;
|
||||||
|
totalSize += SpansCodec.decodeSize(offset);
|
||||||
|
numJobs++;
|
||||||
|
}
|
||||||
|
|
||||||
|
DocumentSpans[] ret = new DocumentSpans[encodedOffsets.length];
|
||||||
|
if (numJobs == 0) return ret;
|
||||||
|
|
||||||
|
CountDownLatch latch = new CountDownLatch(numJobs);
|
||||||
|
MemorySegment segment = arena.allocate(totalSize, 8);
|
||||||
|
|
||||||
|
long bufferOffset = 0;
|
||||||
|
for (int idx = 0; idx < encodedOffsets.length; idx++) {
|
||||||
|
long size = SpansCodec.decodeSize(encodedOffsets[idx]);
|
||||||
|
long start = SpansCodec.decodeStartOffset(encodedOffsets[idx]);
|
||||||
|
|
||||||
|
MemorySegment slice = segment.asSlice(bufferOffset, size);
|
||||||
|
bufferOffset += size;
|
||||||
|
|
||||||
|
int i = idx;
|
||||||
|
forkJoinPool.execute(() -> {
|
||||||
|
var buffer = slice.asByteBuffer();
|
||||||
|
try {
|
||||||
|
spansFileChannels[i% spansFileChannels.length].read(buffer, start);
|
||||||
|
ret[i] = decode(slice);
|
||||||
|
}
|
||||||
|
catch (IOException ex) {
|
||||||
|
throw new RuntimeException(ex);
|
||||||
|
}
|
||||||
|
finally {
|
||||||
|
latch.countDown();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
latch.await();
|
||||||
|
}
|
||||||
|
catch (InterruptedException ex) {
|
||||||
|
Thread.currentThread().interrupt();
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() throws IOException {
|
||||||
|
for (var spansFileChannel : spansFileChannels) {
|
||||||
|
spansFileChannel.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@@ -1,20 +1,23 @@
|
|||||||
package nu.marginalia.index.forward.spans;
|
package nu.marginalia.index.forward.spans;
|
||||||
|
|
||||||
|
import nu.marginalia.sequence.VarintCodedSequence;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.ByteBuffer;
|
import java.nio.ByteBuffer;
|
||||||
|
import java.nio.ByteOrder;
|
||||||
import java.nio.channels.FileChannel;
|
import java.nio.channels.FileChannel;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.nio.file.StandardOpenOption;
|
import java.nio.file.StandardOpenOption;
|
||||||
|
|
||||||
public class ForwardIndexSpansWriter implements AutoCloseable {
|
public class IndexSpansWriter implements AutoCloseable {
|
||||||
private final FileChannel outputChannel;
|
private final FileChannel outputChannel;
|
||||||
private final ByteBuffer work = ByteBuffer.allocate(32);
|
private final ByteBuffer work = ByteBuffer.allocate(65536).order(ByteOrder.nativeOrder());
|
||||||
|
|
||||||
private long stateStartOffset = -1;
|
private long stateStartOffset = -1;
|
||||||
private int stateLength = -1;
|
private int stateLength = -1;
|
||||||
|
|
||||||
public ForwardIndexSpansWriter(Path outputFileSpansData) throws IOException {
|
public IndexSpansWriter(Path outputFileSpansData) throws IOException {
|
||||||
this.outputChannel = (FileChannel) Files.newByteChannel(outputFileSpansData, StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE);
|
this.outputChannel = (FileChannel) Files.newByteChannel(outputFileSpansData, StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -23,7 +26,7 @@ public class ForwardIndexSpansWriter implements AutoCloseable {
|
|||||||
stateLength = 0;
|
stateLength = 0;
|
||||||
|
|
||||||
work.clear();
|
work.clear();
|
||||||
work.put((byte) count);
|
work.putInt(count);
|
||||||
work.flip();
|
work.flip();
|
||||||
|
|
||||||
while (work.hasRemaining())
|
while (work.hasRemaining())
|
||||||
@@ -33,12 +36,17 @@ public class ForwardIndexSpansWriter implements AutoCloseable {
|
|||||||
public void writeSpan(byte spanCode, ByteBuffer sequenceData) throws IOException {
|
public void writeSpan(byte spanCode, ByteBuffer sequenceData) throws IOException {
|
||||||
work.clear();
|
work.clear();
|
||||||
work.put(spanCode);
|
work.put(spanCode);
|
||||||
work.putShort((short) sequenceData.remaining());
|
work.put((byte) 0); // Ensure we're byte aligned
|
||||||
|
var sequence = new VarintCodedSequence(sequenceData);
|
||||||
|
work.putShort((short) sequence.valueCount());
|
||||||
|
|
||||||
|
var iter = sequence.iterator();
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
work.putInt(iter.nextInt());
|
||||||
|
}
|
||||||
work.flip();
|
work.flip();
|
||||||
|
|
||||||
while (work.hasRemaining() || sequenceData.hasRemaining()) {
|
stateLength += outputChannel.write(work);
|
||||||
stateLength += (int) outputChannel.write(new ByteBuffer[]{work, sequenceData});
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public long endRecord() {
|
public long endRecord() {
|
||||||
@@ -47,6 +55,11 @@ public class ForwardIndexSpansWriter implements AutoCloseable {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void close() throws IOException {
|
public void close() throws IOException {
|
||||||
|
ByteBuffer footer = SpansCodec.createSpanFilesFooter(SpansCodec.SpansCodecVersion.PLAIN);
|
||||||
|
outputChannel.position(outputChannel.size());
|
||||||
|
while (footer.hasRemaining()) {
|
||||||
|
outputChannel.write(footer, outputChannel.size());
|
||||||
|
}
|
||||||
outputChannel.close();
|
outputChannel.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
@@ -1,6 +1,21 @@
|
|||||||
package nu.marginalia.index.forward.spans;
|
package nu.marginalia.index.forward.spans;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
|
import java.nio.channels.FileChannel;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.StandardOpenOption;
|
||||||
|
|
||||||
public class SpansCodec {
|
public class SpansCodec {
|
||||||
|
public static int MAGIC_INT = 0xF000F000;
|
||||||
|
public static int FOOTER_SIZE = 8;
|
||||||
|
|
||||||
|
enum SpansCodecVersion {
|
||||||
|
@Deprecated
|
||||||
|
COMPRESSED,
|
||||||
|
PLAIN
|
||||||
|
}
|
||||||
|
|
||||||
public static long encode(long startOffset, long size) {
|
public static long encode(long startOffset, long size) {
|
||||||
assert size < 0x1000_0000L : "Size must be less than 2^28";
|
assert size < 0x1000_0000L : "Size must be less than 2^28";
|
||||||
|
|
||||||
@@ -14,4 +29,31 @@ public class SpansCodec {
|
|||||||
public static long decodeSize(long encoded) {
|
public static long decodeSize(long encoded) {
|
||||||
return encoded & 0x0FFF_FFFFL;
|
return encoded & 0x0FFF_FFFFL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static ByteBuffer createSpanFilesFooter(SpansCodecVersion version) {
|
||||||
|
ByteBuffer footer = ByteBuffer.allocate(FOOTER_SIZE);
|
||||||
|
footer.putInt(SpansCodec.MAGIC_INT);
|
||||||
|
footer.put((byte) version.ordinal());
|
||||||
|
footer.put((byte) 0);
|
||||||
|
footer.put((byte) 0);
|
||||||
|
footer.put((byte) 0);
|
||||||
|
footer.flip();
|
||||||
|
return footer;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static int parseSpanFilesFooter(Path spansFile) throws IOException {
|
||||||
|
ByteBuffer buffer = ByteBuffer.allocate(FOOTER_SIZE);
|
||||||
|
|
||||||
|
try (var fc = FileChannel.open(spansFile, StandardOpenOption.READ)) {
|
||||||
|
if (fc.size() < FOOTER_SIZE) return 0;
|
||||||
|
fc.read(buffer, fc.size() - buffer.capacity());
|
||||||
|
buffer.flip();
|
||||||
|
int magic = buffer.getInt();
|
||||||
|
if (magic != MAGIC_INT) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return buffer.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user