mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
Compare commits
132 Commits
v24.10.0-r
...
deploy-000
Author | SHA1 | Date | |
---|---|---|---|
|
41a59dcf45 | ||
|
94d4d2edb7 | ||
|
7ae19a92ba | ||
|
56d14e56d7 | ||
|
a557c7ae7f | ||
|
b66879ccb1 | ||
|
f1b7157ca2 | ||
|
7622335e84 | ||
|
0da2047eae | ||
|
5ee4321110 | ||
|
9459b9933b | ||
|
87fb564f89 | ||
|
5ca8523220 | ||
|
1118657ffd | ||
|
b1f970152d | ||
|
e1783891ab | ||
|
64d32471dd | ||
|
232cc465d9 | ||
|
8c963bd4ba | ||
|
6a079c1c75 | ||
|
2dc9f2e639 | ||
|
b66fb9caf6 | ||
|
eb2fe18867 | ||
|
a7468c8d23 | ||
|
fb2beb1eac | ||
|
0fb03e3d62 | ||
|
67db3f295e | ||
|
dafaab3ef7 | ||
|
3f11ca409f | ||
|
694eed79ef | ||
|
4220169119 | ||
|
0a53ac68a0 | ||
|
e65d75a0f9 | ||
|
3b99cffb3d | ||
|
a97c05107e | ||
|
5002870d1f | ||
|
73861e613f | ||
|
461bc3eb1a | ||
|
cf7f84f033 | ||
|
9fc82574f0 | ||
|
589f4dafb9 | ||
|
c5d657ef98 | ||
|
3c2bb566da | ||
|
e0c0ed27bc | ||
|
20abb91657 | ||
|
291ca8daf1 | ||
|
ee2d5496d0 | ||
|
5c858a2b94 | ||
|
fdc3efa250 | ||
|
5fdd2c71f8 | ||
|
c97c66a41c | ||
|
7b64377fd6 | ||
|
e11ebf18e5 | ||
|
ba47d72bf4 | ||
|
52bc0272f8 | ||
|
d4bce13a03 | ||
|
b9842b57e0 | ||
|
95776e9bee | ||
|
077d8dcd11 | ||
|
9ec41e27c6 | ||
|
200743c84f | ||
|
6d7998e349 | ||
|
7d1ef08a0f | ||
|
ea6b148df2 | ||
|
3ec9c4c5fa | ||
|
0b6b5dab07 | ||
|
ff17473105 | ||
|
dc5f97e737 | ||
|
d919179ba3 | ||
|
f09669a5b0 | ||
|
b3b0f6fed3 | ||
|
88caca60f9 | ||
|
923ebbac81 | ||
|
df298df852 | ||
|
552b246099 | ||
|
80e6d0069c | ||
|
b941604135 | ||
|
52eb5bc84f | ||
|
4d23fe6261 | ||
|
14519294d2 | ||
|
51e46ad2b0 | ||
|
665c8831a3 | ||
|
47dfbacb00 | ||
|
f94911541a | ||
|
89d8af640d | ||
|
6e4252cf4c | ||
|
79ce4de2ab | ||
|
d6575dfee4 | ||
|
a91ab4c203 | ||
|
6a3079a167 | ||
|
c728a1e2f2 | ||
|
d874d76a09 | ||
|
70bc8831f5 | ||
|
41c11be075 | ||
|
163ce19846 | ||
|
9eb16cb667 | ||
|
af40fa327b | ||
|
cf6d28e71e | ||
|
3791ea1e18 | ||
|
34258b92d1 | ||
|
e5db3f11e1 | ||
|
9f47ce8d15 | ||
|
a5b4951f23 | ||
|
8b8bf0748f | ||
|
5cc71ae586 | ||
|
33fcfe4b63 | ||
|
a31a3b53c4 | ||
|
a456ec9599 | ||
|
a2bc9a98c0 | ||
|
e24a98390c | ||
|
6f858cd627 | ||
|
a293266ccd | ||
|
b8e0dc93d7 | ||
|
d774c39031 | ||
|
ab17af99da | ||
|
b0ac3c586f | ||
|
139fa85b18 | ||
|
bfeb9a4538 | ||
|
3d6c79ae5f | ||
|
c9e9f73ea9 | ||
|
80e482b155 | ||
|
9351593495 | ||
|
d74436f546 | ||
|
76e9053dd0 | ||
|
dbb8bcdd8e | ||
|
7305afa0f8 | ||
|
481f999b70 | ||
|
4b16022556 | ||
|
89dd201a7b | ||
|
ab486323f2 | ||
|
6460c11107 | ||
|
89f7f3c17c |
@@ -49,13 +49,15 @@ associated with each language added, at least a models file or two, as well as s
|
||||
|
||||
It would be very helpful to find a speaker of a large language other than English to help in the fine tuning.
|
||||
|
||||
## Finalize RSS support
|
||||
## Finalize RSS support (COMPLETED 2024-11)
|
||||
|
||||
Marginalia has experimental RSS preview support for a few domains. This works well and
|
||||
it should be extended to all domains. It would also be interesting to offer search of the
|
||||
RSS data itself, or use the RSS set to feed a special live index that updates faster than the
|
||||
main dataset.
|
||||
|
||||
Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122)
|
||||
|
||||
## Support for binary formats like PDF
|
||||
|
||||
The crawler needs to be modified to retain them, and the conversion logic needs to parse them.
|
||||
|
@@ -1,7 +1,6 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
id("org.jetbrains.gradle.plugin.idea-ext") version "1.0"
|
||||
id "io.freefair.lombok" version "8.3"
|
||||
id "me.champeau.jmh" version "0.6.6"
|
||||
|
||||
// This is a workaround for a bug in the Jib plugin that causes it to stall randomly
|
||||
@@ -44,8 +43,8 @@ subprojects.forEach {it ->
|
||||
}
|
||||
|
||||
ext {
|
||||
jvmVersion=22
|
||||
dockerImageBase='container-registry.oracle.com/graalvm/jdk:22'
|
||||
jvmVersion=23
|
||||
dockerImageBase='container-registry.oracle.com/graalvm/jdk:23'
|
||||
dockerImageTag='latest'
|
||||
dockerImageRegistry='marginalia'
|
||||
jibVersion = '3.4.3'
|
||||
|
@@ -1,17 +1,13 @@
|
||||
package nu.marginalia;
|
||||
|
||||
import lombok.Builder;
|
||||
|
||||
import java.nio.file.Path;
|
||||
|
||||
@Builder
|
||||
public class LanguageModels {
|
||||
public final Path termFrequencies;
|
||||
|
||||
public final Path openNLPSentenceDetectionData;
|
||||
public final Path posRules;
|
||||
public final Path posDict;
|
||||
public final Path openNLPTokenData;
|
||||
public final Path fasttextLanguageModel;
|
||||
public final Path segments;
|
||||
|
||||
@@ -19,15 +15,67 @@ public class LanguageModels {
|
||||
Path openNLPSentenceDetectionData,
|
||||
Path posRules,
|
||||
Path posDict,
|
||||
Path openNLPTokenData,
|
||||
Path fasttextLanguageModel,
|
||||
Path segments) {
|
||||
this.termFrequencies = termFrequencies;
|
||||
this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
|
||||
this.posRules = posRules;
|
||||
this.posDict = posDict;
|
||||
this.openNLPTokenData = openNLPTokenData;
|
||||
this.fasttextLanguageModel = fasttextLanguageModel;
|
||||
this.segments = segments;
|
||||
}
|
||||
|
||||
public static LanguageModelsBuilder builder() {
|
||||
return new LanguageModelsBuilder();
|
||||
}
|
||||
|
||||
public static class LanguageModelsBuilder {
|
||||
private Path termFrequencies;
|
||||
private Path openNLPSentenceDetectionData;
|
||||
private Path posRules;
|
||||
private Path posDict;
|
||||
private Path fasttextLanguageModel;
|
||||
private Path segments;
|
||||
|
||||
LanguageModelsBuilder() {
|
||||
}
|
||||
|
||||
public LanguageModelsBuilder termFrequencies(Path termFrequencies) {
|
||||
this.termFrequencies = termFrequencies;
|
||||
return this;
|
||||
}
|
||||
|
||||
public LanguageModelsBuilder openNLPSentenceDetectionData(Path openNLPSentenceDetectionData) {
|
||||
this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
|
||||
return this;
|
||||
}
|
||||
|
||||
public LanguageModelsBuilder posRules(Path posRules) {
|
||||
this.posRules = posRules;
|
||||
return this;
|
||||
}
|
||||
|
||||
public LanguageModelsBuilder posDict(Path posDict) {
|
||||
this.posDict = posDict;
|
||||
return this;
|
||||
}
|
||||
|
||||
public LanguageModelsBuilder fasttextLanguageModel(Path fasttextLanguageModel) {
|
||||
this.fasttextLanguageModel = fasttextLanguageModel;
|
||||
return this;
|
||||
}
|
||||
|
||||
public LanguageModelsBuilder segments(Path segments) {
|
||||
this.segments = segments;
|
||||
return this;
|
||||
}
|
||||
|
||||
public LanguageModels build() {
|
||||
return new LanguageModels(this.termFrequencies, this.openNLPSentenceDetectionData, this.posRules, this.posDict, this.fasttextLanguageModel, this.segments);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "LanguageModels.LanguageModelsBuilder(termFrequencies=" + this.termFrequencies + ", openNLPSentenceDetectionData=" + this.openNLPSentenceDetectionData + ", posRules=" + this.posRules + ", posDict=" + this.posDict + ", fasttextLanguageModel=" + this.fasttextLanguageModel + ", segments=" + this.segments + ")";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -75,6 +75,10 @@ public class WmsaHome {
|
||||
return ret;
|
||||
}
|
||||
|
||||
public static Path getDataPath() {
|
||||
return getHomePath().resolve("data");
|
||||
}
|
||||
|
||||
public static Path getAdsDefinition() {
|
||||
return getHomePath().resolve("data").resolve("adblock.txt");
|
||||
}
|
||||
@@ -100,7 +104,6 @@ public class WmsaHome {
|
||||
home.resolve("model/opennlp-sentence.bin"),
|
||||
home.resolve("model/English.RDR"),
|
||||
home.resolve("model/English.DICT"),
|
||||
home.resolve("model/opennlp-tok.bin"),
|
||||
home.resolve("model/lid.176.ftz"),
|
||||
home.resolve("model/segments.bin")
|
||||
);
|
||||
|
@@ -3,6 +3,7 @@ package nu.marginalia.nodecfg;
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.nodecfg.model.NodeConfiguration;
|
||||
import nu.marginalia.nodecfg.model.NodeProfile;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@@ -20,10 +21,10 @@ public class NodeConfigurationService {
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
public NodeConfiguration create(int id, String description, boolean acceptQueries, boolean keepWarcs) throws SQLException {
|
||||
public NodeConfiguration create(int id, String description, boolean acceptQueries, boolean keepWarcs, NodeProfile nodeProfile) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var is = conn.prepareStatement("""
|
||||
INSERT IGNORE INTO NODE_CONFIGURATION(ID, DESCRIPTION, ACCEPT_QUERIES, KEEP_WARCS) VALUES(?, ?, ?, ?)
|
||||
INSERT IGNORE INTO NODE_CONFIGURATION(ID, DESCRIPTION, ACCEPT_QUERIES, KEEP_WARCS, NODE_PROFILE) VALUES(?, ?, ?, ?, ?)
|
||||
""")
|
||||
)
|
||||
{
|
||||
@@ -31,6 +32,7 @@ public class NodeConfigurationService {
|
||||
is.setString(2, description);
|
||||
is.setBoolean(3, acceptQueries);
|
||||
is.setBoolean(4, keepWarcs);
|
||||
is.setString(5, nodeProfile.name());
|
||||
|
||||
if (is.executeUpdate() <= 0) {
|
||||
throw new IllegalStateException("Failed to insert configuration");
|
||||
@@ -43,7 +45,7 @@ public class NodeConfigurationService {
|
||||
public List<NodeConfiguration> getAll() {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var qs = conn.prepareStatement("""
|
||||
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, DISABLED
|
||||
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, NODE_PROFILE, DISABLED
|
||||
FROM NODE_CONFIGURATION
|
||||
""")) {
|
||||
var rs = qs.executeQuery();
|
||||
@@ -58,6 +60,7 @@ public class NodeConfigurationService {
|
||||
rs.getBoolean("AUTO_CLEAN"),
|
||||
rs.getBoolean("PRECESSION"),
|
||||
rs.getBoolean("KEEP_WARCS"),
|
||||
NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
|
||||
rs.getBoolean("DISABLED")
|
||||
));
|
||||
}
|
||||
@@ -72,7 +75,7 @@ public class NodeConfigurationService {
|
||||
public NodeConfiguration get(int nodeId) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var qs = conn.prepareStatement("""
|
||||
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, DISABLED
|
||||
SELECT ID, DESCRIPTION, ACCEPT_QUERIES, AUTO_CLEAN, PRECESSION, KEEP_WARCS, NODE_PROFILE, DISABLED
|
||||
FROM NODE_CONFIGURATION
|
||||
WHERE ID=?
|
||||
""")) {
|
||||
@@ -86,6 +89,7 @@ public class NodeConfigurationService {
|
||||
rs.getBoolean("AUTO_CLEAN"),
|
||||
rs.getBoolean("PRECESSION"),
|
||||
rs.getBoolean("KEEP_WARCS"),
|
||||
NodeProfile.valueOf(rs.getString("NODE_PROFILE")),
|
||||
rs.getBoolean("DISABLED")
|
||||
);
|
||||
}
|
||||
@@ -98,7 +102,7 @@ public class NodeConfigurationService {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var us = conn.prepareStatement("""
|
||||
UPDATE NODE_CONFIGURATION
|
||||
SET DESCRIPTION=?, ACCEPT_QUERIES=?, AUTO_CLEAN=?, PRECESSION=?, KEEP_WARCS=?, DISABLED=?
|
||||
SET DESCRIPTION=?, ACCEPT_QUERIES=?, AUTO_CLEAN=?, PRECESSION=?, KEEP_WARCS=?, DISABLED=?, NODE_PROFILE=?
|
||||
WHERE ID=?
|
||||
"""))
|
||||
{
|
||||
@@ -108,7 +112,8 @@ public class NodeConfigurationService {
|
||||
us.setBoolean(4, config.includeInPrecession());
|
||||
us.setBoolean(5, config.keepWarcs());
|
||||
us.setBoolean(6, config.disabled());
|
||||
us.setInt(7, config.node());
|
||||
us.setString(7, config.profile().name());
|
||||
us.setInt(8, config.node());
|
||||
|
||||
if (us.executeUpdate() <= 0)
|
||||
throw new IllegalStateException("Failed to update configuration");
|
||||
|
@@ -6,6 +6,7 @@ public record NodeConfiguration(int node,
|
||||
boolean autoClean,
|
||||
boolean includeInPrecession,
|
||||
boolean keepWarcs,
|
||||
NodeProfile profile,
|
||||
boolean disabled
|
||||
)
|
||||
{
|
||||
|
@@ -0,0 +1,28 @@
|
||||
package nu.marginalia.nodecfg.model;
|
||||
|
||||
public enum NodeProfile {
|
||||
BATCH_CRAWL,
|
||||
REALTIME,
|
||||
MIXED,
|
||||
SIDELOAD;
|
||||
|
||||
public boolean isBatchCrawl() {
|
||||
return this == BATCH_CRAWL;
|
||||
}
|
||||
public boolean isRealtime() {
|
||||
return this == REALTIME;
|
||||
}
|
||||
public boolean isMixed() {
|
||||
return this == MIXED;
|
||||
}
|
||||
public boolean isSideload() {
|
||||
return this == SIDELOAD;
|
||||
}
|
||||
|
||||
public boolean permitBatchCrawl() {
|
||||
return isBatchCrawl() ||isMixed();
|
||||
}
|
||||
public boolean permitSideload() {
|
||||
return isMixed() || isSideload();
|
||||
}
|
||||
}
|
@@ -2,6 +2,7 @@ package nu.marginalia.nodecfg;
|
||||
|
||||
import com.zaxxer.hikari.HikariConfig;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.nodecfg.model.NodeProfile;
|
||||
import nu.marginalia.test.TestMigrationLoader;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
@@ -46,8 +47,8 @@ public class NodeConfigurationServiceTest {
|
||||
|
||||
@Test
|
||||
public void test() throws SQLException {
|
||||
var a = nodeConfigurationService.create(1, "Test", false, false);
|
||||
var b = nodeConfigurationService.create(2, "Foo", true, false);
|
||||
var a = nodeConfigurationService.create(1, "Test", false, false, NodeProfile.MIXED);
|
||||
var b = nodeConfigurationService.create(2, "Foo", true, false, NodeProfile.MIXED);
|
||||
|
||||
assertEquals(1, a.node());
|
||||
assertEquals("Test", a.description());
|
||||
|
@@ -7,12 +7,13 @@ import com.google.common.util.concurrent.UncheckedExecutionException;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Optional;
|
||||
import java.util.OptionalInt;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
|
||||
@Singleton
|
||||
public class DbDomainQueries {
|
||||
@@ -27,7 +28,6 @@ public class DbDomainQueries {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public Integer getDomainId(EdgeDomain domain) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
@@ -42,12 +42,14 @@ public class DbDomainQueries {
|
||||
throw new NoSuchElementException();
|
||||
});
|
||||
}
|
||||
catch (UncheckedExecutionException ex) {
|
||||
throw ex.getCause();
|
||||
catch (ExecutionException ex) {
|
||||
throw new RuntimeException(ex.getCause());
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public OptionalInt tryGetDomainId(EdgeDomain domain) {
|
||||
|
||||
Integer maybeId = domainIdCache.getIfPresent(domain);
|
||||
@@ -70,11 +72,13 @@ public class DbDomainQueries {
|
||||
return OptionalInt.empty();
|
||||
}
|
||||
catch (UncheckedExecutionException ex) {
|
||||
return OptionalInt.empty();
|
||||
throw new RuntimeException(ex.getCause());
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public Optional<EdgeDomain> getDomain(int id) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
@@ -87,5 +91,11 @@ public class DbDomainQueries {
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
catch (UncheckedExecutionException ex) {
|
||||
throw new RuntimeException(ex.getCause());
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -2,7 +2,6 @@ package nu.marginalia.db;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.With;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@@ -115,23 +114,23 @@ public class DomainRankingSetsService {
|
||||
}
|
||||
}
|
||||
|
||||
/** Defines a domain ranking set, parameters for the ranking algorithms.
|
||||
/**
|
||||
* Defines a domain ranking set, parameters for the ranking algorithms.
|
||||
*
|
||||
* @param name Key and name of the set
|
||||
* @param name Key and name of the set
|
||||
* @param description Human-readable description
|
||||
* @param depth Depth of the algorithm
|
||||
* @param definition Definition of the set, typically a list of domains or globs for domain-names
|
||||
* */
|
||||
@With
|
||||
* @param depth Depth of the algorithm
|
||||
* @param definition Definition of the set, typically a list of domains or globs for domain-names
|
||||
*/
|
||||
public record DomainRankingSet(String name,
|
||||
String description,
|
||||
int depth,
|
||||
String definition)
|
||||
{
|
||||
String definition) {
|
||||
|
||||
public Path fileName(Path base) {
|
||||
return base.resolve(name().toLowerCase() + ".dat");
|
||||
}
|
||||
|
||||
public String[] domains() {
|
||||
return Arrays.stream(definition().split("\n+"))
|
||||
.map(String::trim)
|
||||
@@ -144,5 +143,20 @@ public class DomainRankingSetsService {
|
||||
return name().equals("BLOGS") || name().equals("NONE") || name().equals("RANK");
|
||||
}
|
||||
|
||||
public DomainRankingSet withName(String name) {
|
||||
return this.name == name ? this : new DomainRankingSet(name, description, depth, definition);
|
||||
}
|
||||
|
||||
public DomainRankingSet withDescription(String description) {
|
||||
return this.description == description ? this : new DomainRankingSet(name, description, depth, definition);
|
||||
}
|
||||
|
||||
public DomainRankingSet withDepth(int depth) {
|
||||
return this.depth == depth ? this : new DomainRankingSet(name, description, depth, definition);
|
||||
}
|
||||
|
||||
public DomainRankingSet withDefinition(String definition) {
|
||||
return this.definition == definition ? this : new DomainRankingSet(name, description, depth, definition);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -0,0 +1 @@
|
||||
ALTER TABLE WMSA_prod.NODE_CONFIGURATION ADD COLUMN NODE_PROFILE VARCHAR(255) DEFAULT 'MIXED';
|
@@ -1,15 +1,12 @@
|
||||
package nu.marginalia.model;
|
||||
|
||||
import lombok.*;
|
||||
|
||||
import javax.annotation.Nonnull;
|
||||
import java.io.Serializable;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@AllArgsConstructor
|
||||
@Getter @Setter @Builder
|
||||
public class EdgeDomain implements Serializable {
|
||||
|
||||
@Nonnull
|
||||
@@ -17,7 +14,6 @@ public class EdgeDomain implements Serializable {
|
||||
@Nonnull
|
||||
public final String topDomain;
|
||||
|
||||
@SneakyThrows
|
||||
public EdgeDomain(String host) {
|
||||
Objects.requireNonNull(host, "domain name must not be null");
|
||||
|
||||
@@ -34,28 +30,23 @@ public class EdgeDomain implements Serializable {
|
||||
if (dot < 0 || looksLikeAnIp(host)) { // IPV6 >.>
|
||||
subDomain = "";
|
||||
topDomain = host;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
int dot2 = host.substring(0, dot).lastIndexOf('.');
|
||||
if (dot2 < 0) {
|
||||
subDomain = "";
|
||||
topDomain = host;
|
||||
}
|
||||
else {
|
||||
if (looksLikeGovTld(host))
|
||||
{ // Capture .ac.jp, .co.uk
|
||||
} else {
|
||||
if (looksLikeGovTld(host)) { // Capture .ac.jp, .co.uk
|
||||
int dot3 = host.substring(0, dot2).lastIndexOf('.');
|
||||
if (dot3 >= 0) {
|
||||
dot2 = dot3;
|
||||
subDomain = host.substring(0, dot2);
|
||||
topDomain = host.substring(dot2 + 1);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
subDomain = "";
|
||||
topDomain = host;
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
subDomain = host.substring(0, dot2);
|
||||
topDomain = host.substring(dot2 + 1);
|
||||
}
|
||||
@@ -64,6 +55,12 @@ public class EdgeDomain implements Serializable {
|
||||
}
|
||||
|
||||
private static final Predicate<String> govListTest = Pattern.compile(".*\\.(id|ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate();
|
||||
|
||||
public EdgeDomain(@Nonnull String subDomain, @Nonnull String topDomain) {
|
||||
this.subDomain = subDomain;
|
||||
this.topDomain = topDomain;
|
||||
}
|
||||
|
||||
private boolean looksLikeGovTld(String host) {
|
||||
if (host.length() < 8)
|
||||
return false;
|
||||
@@ -91,11 +88,11 @@ public class EdgeDomain implements Serializable {
|
||||
}
|
||||
|
||||
|
||||
|
||||
public EdgeUrl toRootUrlHttp() {
|
||||
// Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http
|
||||
return new EdgeUrl("http", this, null, "/", null);
|
||||
}
|
||||
|
||||
public EdgeUrl toRootUrlHttps() {
|
||||
return new EdgeUrl("https", this, null, "/", null);
|
||||
}
|
||||
@@ -125,8 +122,7 @@ public class EdgeDomain implements Serializable {
|
||||
int cutPoint = topDomain.indexOf('.');
|
||||
if (cutPoint < 0) {
|
||||
ret.append(topDomain);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
ret.append(topDomain, 0, cutPoint);
|
||||
}
|
||||
|
||||
@@ -138,6 +134,18 @@ public class EdgeDomain implements Serializable {
|
||||
return ret.toString().toLowerCase();
|
||||
}
|
||||
|
||||
/** If possible, try to provide an alias domain,
|
||||
* i.e. a domain name that is very likely to link to this one
|
||||
* */
|
||||
public Optional<EdgeDomain> aliasDomain() {
|
||||
if (subDomain.equals("www")) {
|
||||
return Optional.of(new EdgeDomain("", topDomain));
|
||||
} else if (subDomain.isBlank()){
|
||||
return Optional.of(new EdgeDomain("www", topDomain));
|
||||
}
|
||||
else return Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
public boolean hasSameTopDomain(EdgeDomain other) {
|
||||
if (other == null) return false;
|
||||
@@ -155,16 +163,14 @@ public class EdgeDomain implements Serializable {
|
||||
|
||||
if (govListTest.test(topDomain)) {
|
||||
dot = topDomain.indexOf('.', Math.max(0, length - ".edu.uk".length()));
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
dot = topDomain.lastIndexOf('.');
|
||||
}
|
||||
|
||||
|
||||
if (dot < 0 || dot == topDomain.length() - 1) {
|
||||
return "-";
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
return topDomain.substring(dot + 1);
|
||||
}
|
||||
}
|
||||
@@ -174,10 +180,10 @@ public class EdgeDomain implements Serializable {
|
||||
if (!(o instanceof EdgeDomain other)) return false;
|
||||
final String this$subDomain = this.getSubDomain();
|
||||
final String other$subDomain = other.getSubDomain();
|
||||
if (!Objects.equals(this$subDomain,other$subDomain)) return false;
|
||||
if (!Objects.equals(this$subDomain, other$subDomain)) return false;
|
||||
final String this$domain = this.getTopDomain();
|
||||
final String other$domain = other.getTopDomain();
|
||||
if (!Objects.equals(this$domain,other$domain)) return false;
|
||||
if (!Objects.equals(this$domain, other$domain)) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -191,4 +197,13 @@ public class EdgeDomain implements Serializable {
|
||||
return result;
|
||||
}
|
||||
|
||||
@Nonnull
|
||||
public String getSubDomain() {
|
||||
return this.subDomain;
|
||||
}
|
||||
|
||||
@Nonnull
|
||||
public String getTopDomain() {
|
||||
return this.topDomain;
|
||||
}
|
||||
}
|
||||
|
@@ -1,8 +1,5 @@
|
||||
package nu.marginalia.model;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import nu.marginalia.util.QueryParams;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
@@ -15,7 +12,6 @@ import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@Getter @Setter @Builder
|
||||
public class EdgeUrl implements Serializable {
|
||||
public final String proto;
|
||||
public final EdgeDomain domain;
|
||||
@@ -38,9 +34,8 @@ public class EdgeUrl implements Serializable {
|
||||
private static URI parseURI(String url) throws URISyntaxException {
|
||||
try {
|
||||
return new URI(urlencodeFixer(url));
|
||||
}
|
||||
catch (URISyntaxException ex) {
|
||||
throw new URISyntaxException(STR."Failed to parse URI '\{url}'", ex.getMessage());
|
||||
} catch (URISyntaxException ex) {
|
||||
throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -83,20 +78,17 @@ public class EdgeUrl implements Serializable {
|
||||
for (int i = pathIdx; i < end; i++) {
|
||||
int c = url.charAt(i);
|
||||
|
||||
if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <='Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
|
||||
if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
|
||||
s.appendCodePoint(c);
|
||||
}
|
||||
else if (c == '%' && i+2<end) {
|
||||
int cn = url.charAt(i+1);
|
||||
int cnn = url.charAt(i+2);
|
||||
} else if (c == '%' && i + 2 < end) {
|
||||
int cn = url.charAt(i + 1);
|
||||
int cnn = url.charAt(i + 2);
|
||||
if (hexChars.indexOf(cn) >= 0 && hexChars.indexOf(cnn) >= 0) {
|
||||
s.appendCodePoint(c);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
s.append("%25");
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
s.append(String.format("%%%02X", c));
|
||||
}
|
||||
}
|
||||
@@ -109,7 +101,7 @@ public class EdgeUrl implements Serializable {
|
||||
if (colonIdx < 0 || colonIdx + 2 >= url.length()) {
|
||||
throw new URISyntaxException(url, "Lacking protocol");
|
||||
}
|
||||
return url.indexOf('/', colonIdx+2);
|
||||
return url.indexOf('/', colonIdx + 2);
|
||||
}
|
||||
|
||||
public EdgeUrl(URI URI) {
|
||||
@@ -125,8 +117,7 @@ public class EdgeUrl implements Serializable {
|
||||
this.proto = URI.getScheme().toLowerCase();
|
||||
this.port = port(URI.getPort(), proto);
|
||||
this.param = QueryParams.queryParamsSanitizer(this.path, URI.getQuery());
|
||||
}
|
||||
catch (Exception ex) {
|
||||
} catch (Exception ex) {
|
||||
System.err.println("Failed to parse " + URI);
|
||||
throw ex;
|
||||
}
|
||||
@@ -145,8 +136,7 @@ public class EdgeUrl implements Serializable {
|
||||
this.proto = URL.getProtocol().toLowerCase();
|
||||
this.port = port(URL.getPort(), proto);
|
||||
this.param = QueryParams.queryParamsSanitizer(this.path, URL.getQuery());
|
||||
}
|
||||
catch (Exception ex) {
|
||||
} catch (Exception ex) {
|
||||
System.err.println("Failed to parse " + URL);
|
||||
throw ex;
|
||||
}
|
||||
@@ -158,8 +148,7 @@ public class EdgeUrl implements Serializable {
|
||||
}
|
||||
if (protocol.equals("http") && port == 80) {
|
||||
return null;
|
||||
}
|
||||
else if (protocol.equals("https") && port == 443) {
|
||||
} else if (protocol.equals("https") && port == 443) {
|
||||
return null;
|
||||
}
|
||||
return port;
|
||||
@@ -190,12 +179,13 @@ public class EdgeUrl implements Serializable {
|
||||
public String dir() {
|
||||
return path.replaceAll("/[^/]+$", "/");
|
||||
}
|
||||
|
||||
public String fileName() {
|
||||
return path.replaceAll(".*/", "");
|
||||
}
|
||||
|
||||
public int depth() {
|
||||
return (int) path.chars().filter(c -> c=='/').count();
|
||||
return (int) path.chars().filter(c -> c == '/').count();
|
||||
}
|
||||
|
||||
public EdgeUrl withPathAndParam(String path, String param) {
|
||||
@@ -207,8 +197,8 @@ public class EdgeUrl implements Serializable {
|
||||
if (other == this) return true;
|
||||
if (other instanceof EdgeUrl e) {
|
||||
return Objects.equals(e.domain, domain)
|
||||
&& Objects.equals(e.path, path)
|
||||
&& Objects.equals(e.param, param);
|
||||
&& Objects.equals(e.path, path)
|
||||
&& Objects.equals(e.param, param);
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -235,8 +225,7 @@ public class EdgeUrl implements Serializable {
|
||||
public URL asURL() throws MalformedURLException {
|
||||
try {
|
||||
return asURI().toURL();
|
||||
}
|
||||
catch (URISyntaxException e) {
|
||||
} catch (URISyntaxException e) {
|
||||
throw new MalformedURLException(e.getMessage());
|
||||
}
|
||||
}
|
||||
@@ -248,4 +237,13 @@ public class EdgeUrl implements Serializable {
|
||||
|
||||
return new URI(this.proto, this.domain.toString(), this.path, this.param, null);
|
||||
}
|
||||
|
||||
public EdgeDomain getDomain() {
|
||||
return this.domain;
|
||||
}
|
||||
|
||||
public String getProto() {
|
||||
return this.proto;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -16,6 +16,9 @@ public enum HtmlFeature {
|
||||
KEBAB_CASE_URL("special:kcurl"), // https://www.example.com/urls-that-look-like-this/
|
||||
LONG_URL("special:longurl"),
|
||||
|
||||
CLOUDFLARE_FEATURE("special:cloudflare"),
|
||||
CDN_FEATURE("special:cdn"),
|
||||
|
||||
VIEWPORT("special:viewport"),
|
||||
|
||||
COOKIES("special:cookies"),
|
||||
@@ -60,6 +63,8 @@ public enum HtmlFeature {
|
||||
DOFOLLOW_LINK("special:dofollow"),
|
||||
APPLE_TOUCH_ICON("special:appleicon"),
|
||||
|
||||
S3_FEATURE("special:s3"),
|
||||
|
||||
UNKNOWN("special:uncategorized");
|
||||
|
||||
|
||||
|
@@ -1,34 +0,0 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation libs.notnull
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
|
||||
implementation libs.guava
|
||||
implementation libs.guava
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
implementation libs.bundles.mariadb
|
||||
implementation libs.commons.lang3
|
||||
|
||||
implementation libs.snakeyaml
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
}
|
@@ -1,7 +0,0 @@
|
||||
package nu.marginalia.process.control;
|
||||
|
||||
public interface ProcessAdHocTaskHeartbeat extends AutoCloseable {
|
||||
void progress(String step, int progress, int total);
|
||||
|
||||
void close();
|
||||
}
|
@@ -1,52 +0,0 @@
|
||||
package nu.marginalia.process.log;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Iterator;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Function;
|
||||
|
||||
class WorkLoadIterable<T> implements Iterable<T> {
|
||||
|
||||
private final Path logFile;
|
||||
private final Function<WorkLogEntry, Optional<T>> mapper;
|
||||
|
||||
WorkLoadIterable(Path logFile, Function<WorkLogEntry, Optional<T>> mapper) {
|
||||
this.logFile = logFile;
|
||||
this.mapper = mapper;
|
||||
}
|
||||
|
||||
@NotNull
|
||||
@Override
|
||||
@SneakyThrows
|
||||
public Iterator<T> iterator() {
|
||||
var stream = Files.lines(logFile);
|
||||
return new Iterator<>() {
|
||||
final Iterator<T> iter = stream
|
||||
.filter(WorkLogEntry::isJobId)
|
||||
.map(WorkLogEntry::parse)
|
||||
.map(mapper)
|
||||
.filter(Optional::isPresent)
|
||||
.map(Optional::get)
|
||||
.iterator();
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
if (iter.hasNext()) {
|
||||
return true;
|
||||
} else {
|
||||
stream.close();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public T next() {
|
||||
return iter.next();
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
@@ -1,4 +0,0 @@
|
||||
# Process
|
||||
|
||||
Basic functionality for a Process. Processes must include this dependency to ensure
|
||||
their loggers are configured properly!
|
@@ -1,9 +0,0 @@
|
||||
log4j2.isThreadContextMapInheritable=true
|
||||
status = info
|
||||
appender.console.type = Console
|
||||
appender.console.name = LogToConsole
|
||||
appender.console.layout.type = PatternLayout
|
||||
appender.console.layout.pattern = %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %c{1}- %msg{nolookups}%n
|
||||
appender.console.filter.http.type = MarkerFilter
|
||||
rootLogger.level = info
|
||||
rootLogger.appenderRef.console.ref = LogToConsole
|
@@ -4,12 +4,12 @@ import com.github.jknack.handlebars.*;
|
||||
import com.github.jknack.handlebars.helper.ConditionalHelpers;
|
||||
import com.github.jknack.handlebars.io.ClassPathTemplateLoader;
|
||||
import com.github.jknack.handlebars.io.TemplateLoader;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.renderer.config.HandlebarsConfigurator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.*;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
@@ -42,22 +42,35 @@ public class MustacheRenderer<T> {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public String render(T model) {
|
||||
return template.apply(model);
|
||||
try {
|
||||
return template.apply(model);
|
||||
}
|
||||
catch (IOException ex) {
|
||||
throw new RuntimeException("Failed to render template", ex);
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public <T2> String render(T model, String name, List<T2> children) {
|
||||
Context ctx = Context.newBuilder(model).combine(name, children).build();
|
||||
|
||||
return template.apply(ctx);
|
||||
try {
|
||||
return template.apply(ctx);
|
||||
}
|
||||
catch (IOException ex) {
|
||||
throw new RuntimeException("Failed to render template", ex);
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public String render(T model, Map<String, ?> children) {
|
||||
Context ctx = Context.newBuilder(model).combine(children).build();
|
||||
return template.apply(ctx);
|
||||
|
||||
try {
|
||||
return template.apply(ctx);
|
||||
}
|
||||
catch (IOException ex) {
|
||||
throw new RuntimeException("Failed to render template", ex);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia;
|
||||
package nu.marginalia.process;
|
||||
|
||||
import java.util.UUID;
|
||||
|
@@ -1,4 +1,4 @@
|
||||
package nu.marginalia;
|
||||
package nu.marginalia.process;
|
||||
|
||||
import com.google.inject.AbstractModule;
|
||||
import com.google.inject.name.Names;
|
@@ -0,0 +1,102 @@
|
||||
package nu.marginalia.process;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import nu.marginalia.mq.MessageQueueFactory;
|
||||
import nu.marginalia.mq.MqMessage;
|
||||
import nu.marginalia.mq.inbox.MqInboxResponse;
|
||||
import nu.marginalia.mq.inbox.MqSingleShotInbox;
|
||||
import nu.marginalia.service.ConfigLoader;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.Optional;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public abstract class ProcessMainClass {
|
||||
private static final Logger logger = LoggerFactory.getLogger(ProcessMainClass.class);
|
||||
|
||||
private final MessageQueueFactory messageQueueFactory;
|
||||
private final int node;
|
||||
private final String inboxName;
|
||||
|
||||
static {
|
||||
// Load global config ASAP
|
||||
ConfigLoader.loadConfig(
|
||||
ConfigLoader.getConfigPath("system")
|
||||
);
|
||||
}
|
||||
|
||||
private final Gson gson;
|
||||
|
||||
public ProcessMainClass(MessageQueueFactory messageQueueFactory,
|
||||
ProcessConfiguration config,
|
||||
Gson gson,
|
||||
String inboxName
|
||||
) {
|
||||
this.gson = gson;
|
||||
new org.mariadb.jdbc.Driver();
|
||||
this.messageQueueFactory = messageQueueFactory;
|
||||
this.node = config.node();
|
||||
this.inboxName = inboxName;
|
||||
}
|
||||
|
||||
|
||||
protected <T> Instructions<T> fetchInstructions(Class<T> requestType) throws Exception {
|
||||
|
||||
var inbox = messageQueueFactory.createSingleShotInbox(inboxName, node, UUID.randomUUID());
|
||||
|
||||
logger.info("Waiting for instructions");
|
||||
|
||||
var msgOpt = getMessage(inbox, requestType.getSimpleName());
|
||||
var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received"));
|
||||
|
||||
// for live crawl, request is empty for now
|
||||
T request = gson.fromJson(msg.payload(), requestType);
|
||||
|
||||
return new Instructions<>(msg, inbox, request);
|
||||
}
|
||||
|
||||
|
||||
private Optional<MqMessage> getMessage(MqSingleShotInbox inbox, String expectedFunction) throws InterruptedException, SQLException {
|
||||
var opt = inbox.waitForMessage(30, TimeUnit.SECONDS);
|
||||
if (opt.isPresent()) {
|
||||
if (!opt.get().function().equals(expectedFunction)) {
|
||||
throw new RuntimeException("Unexpected function: " + opt.get().function());
|
||||
}
|
||||
return opt;
|
||||
}
|
||||
else {
|
||||
var stolenMessage = inbox.stealMessage(msg -> msg.function().equals(expectedFunction));
|
||||
stolenMessage.ifPresent(mqMessage -> logger.info("Stole message {}", mqMessage));
|
||||
return stolenMessage;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
protected static class Instructions<T> {
|
||||
private final MqMessage message;
|
||||
private final MqSingleShotInbox inbox;
|
||||
private final T value;
|
||||
Instructions(MqMessage message, MqSingleShotInbox inbox, T value)
|
||||
{
|
||||
this.message = message;
|
||||
this.inbox = inbox;
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
public T value() {
|
||||
return value;
|
||||
}
|
||||
|
||||
public void ok() {
|
||||
inbox.sendResponse(message, MqInboxResponse.ok());
|
||||
}
|
||||
public void err() {
|
||||
inbox.sendResponse(message, MqInboxResponse.err());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
@@ -3,6 +3,8 @@ package nu.marginalia.process.control;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.Collection;
|
||||
|
||||
/** Dummy implementation of ProcessHeartbeat that does nothing */
|
||||
public class FakeProcessHeartbeat implements ProcessHeartbeat {
|
||||
private static final Logger logger = LoggerFactory.getLogger(FakeProcessHeartbeat.class);
|
||||
@@ -30,6 +32,11 @@ public class FakeProcessHeartbeat implements ProcessHeartbeat {
|
||||
logger.info("Progress: {}, {}/{}", step, progress, total);
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T> Iterable<T> wrap(String step, Collection<T> collection) {
|
||||
return collection;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {}
|
||||
};
|
@@ -0,0 +1,12 @@
|
||||
package nu.marginalia.process.control;
|
||||
|
||||
import java.util.Collection;
|
||||
|
||||
/** Heartbeat for an ad-hoc task, where progress is reported as a position out of a total. */
public interface ProcessAdHocTaskHeartbeat extends AutoCloseable {

    /** Report that the given step has reached {@code progress} out of {@code total}. */
    void progress(String step, int progress, int total);

    /** Wrap a collection to provide heartbeat progress updates as it's iterated through */
    <T> Iterable<T> wrap(String step, Collection<T> collection);

    /** Redeclared without a throws clause, so callers need not handle checked exceptions. */
    @Override
    void close();
}
|
@@ -2,11 +2,13 @@ package nu.marginalia.process.control;
|
||||
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.ProcessConfiguration;
|
||||
import nu.marginalia.process.ProcessConfiguration;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
@@ -69,6 +71,35 @@ public class ProcessAdHocTaskHeartbeatImpl implements AutoCloseable, ProcessAdHo
|
||||
logger.info("ProcessTask {} progress: {}%", taskBase, progress);
|
||||
}
|
||||
|
||||
/** Wrap a collection to provide heartbeat progress updates as it's iterated through */
|
||||
@Override
|
||||
public <T> Iterable<T> wrap(String step, Collection<T> collection) {
|
||||
return () -> new Iterator<>() {
|
||||
private final Iterator<T> base = collection.iterator();
|
||||
private final int size = collection.size();
|
||||
private final int updateInterval = Math.max(1, size / 100); // update every 1% of the collection, or at least once
|
||||
private int pos = 0;
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
boolean ret = base.hasNext();
|
||||
if (!ret) {
|
||||
progress(step, size, size);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public T next() {
|
||||
// update every 1% of the collection, to avoid hammering the database with updates
|
||||
if (pos++ % updateInterval == 0) {
|
||||
progress(step, pos, size);
|
||||
}
|
||||
return base.next();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
public void shutDown() {
|
||||
if (!running)
|
||||
return;
|
||||
@@ -185,6 +216,5 @@ public class ProcessAdHocTaskHeartbeatImpl implements AutoCloseable, ProcessAdHo
|
||||
public void close() {
|
||||
shutDown();
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -4,17 +4,18 @@ package nu.marginalia.process.control;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.ProcessConfiguration;
|
||||
import nu.marginalia.process.ProcessConfiguration;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.sql.SQLException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/** This service sends a heartbeat to the database every 5 seconds.
|
||||
*/
|
||||
@Singleton
|
||||
public class ProcessHeartbeatImpl implements ProcessHeartbeat {
|
||||
public class ProcessHeartbeatImpl implements ProcessHeartbeat, Closeable {
|
||||
private final Logger logger = LoggerFactory.getLogger(ProcessHeartbeatImpl.class);
|
||||
private final String processName;
|
||||
private final String processBase;
|
||||
@@ -169,5 +170,9 @@ public class ProcessHeartbeatImpl implements ProcessHeartbeat {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void close() {
|
||||
shutDown();
|
||||
}
|
||||
}
|
||||
|
@@ -2,7 +2,7 @@ package nu.marginalia.process.control;
|
||||
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.ProcessConfiguration;
|
||||
import nu.marginalia.process.ProcessConfiguration;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
@@ -0,0 +1,56 @@
|
||||
package nu.marginalia.process.log;
|
||||
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Iterator;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Function;
|
||||
|
||||
class WorkLoadIterable<T> implements Iterable<T> {
|
||||
|
||||
private final Path logFile;
|
||||
private final Function<WorkLogEntry, Optional<T>> mapper;
|
||||
|
||||
WorkLoadIterable(Path logFile, Function<WorkLogEntry, Optional<T>> mapper) {
|
||||
this.logFile = logFile;
|
||||
this.mapper = mapper;
|
||||
}
|
||||
|
||||
@NotNull
|
||||
@Override
|
||||
public Iterator<T> iterator() {
|
||||
try {
|
||||
var stream = Files.lines(logFile);
|
||||
return new Iterator<>() {
|
||||
final Iterator<T> iter = stream
|
||||
.filter(WorkLogEntry::isJobId)
|
||||
.map(WorkLogEntry::parse)
|
||||
.map(mapper)
|
||||
.filter(Optional::isPresent)
|
||||
.map(Optional::get)
|
||||
.iterator();
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
if (iter.hasNext()) {
|
||||
return true;
|
||||
} else {
|
||||
stream.close();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public T next() {
|
||||
return iter.next();
|
||||
}
|
||||
};
|
||||
}
|
||||
catch (IOException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
}
|
@@ -33,6 +33,6 @@ public record WorkLogEntry(String id, String ts, String path, int cnt) {
|
||||
|
||||
String relPath = fileName();
|
||||
|
||||
return STR."\{relPath.substring(0, 2)}/\{relPath.substring(2, 4)}/\{relPath}";
|
||||
return relPath.substring(0, 2) + "/" + relPath.substring(2, 4) + "/" + relPath;
|
||||
}
|
||||
}
|
@@ -9,11 +9,11 @@ import java.util.Properties;
|
||||
|
||||
public class ConfigLoader {
|
||||
|
||||
static Path getConfigPath(String configName) {
|
||||
public static Path getConfigPath(String configName) {
|
||||
return WmsaHome.getHomePath().resolve("conf/properties/" + configName + ".properties");
|
||||
}
|
||||
|
||||
static void loadConfig(Path configPath) {
|
||||
public static void loadConfig(Path configPath) {
|
||||
if (!Files.exists(configPath)) {
|
||||
System.err.println("No config file found at " + configPath);
|
||||
return;
|
||||
|
@@ -2,7 +2,6 @@ package nu.marginalia.service;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@@ -26,7 +25,6 @@ public class NodeConfigurationWatcher {
|
||||
watcherThread.start();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private void pollConfiguration() {
|
||||
for (;;) {
|
||||
List<Integer> goodNodes = new ArrayList<>();
|
||||
@@ -34,7 +32,7 @@ public class NodeConfigurationWatcher {
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT ID FROM NODE_CONFIGURATION
|
||||
WHERE ACCEPT_QUERIES AND NOT DISABLED
|
||||
WHERE ACCEPT_QUERIES AND NOT DISABLED
|
||||
""");
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
@@ -47,7 +45,12 @@ public class NodeConfigurationWatcher {
|
||||
|
||||
queryNodes = goodNodes;
|
||||
|
||||
TimeUnit.SECONDS.sleep(10);
|
||||
try {
|
||||
TimeUnit.SECONDS.sleep(10);
|
||||
}
|
||||
catch (InterruptedException ex) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -1,20 +0,0 @@
|
||||
package nu.marginalia.service;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
public abstract class ProcessMainClass {
|
||||
private static final Logger logger = LoggerFactory.getLogger(ProcessMainClass.class);
|
||||
|
||||
static {
|
||||
// Load global config ASAP
|
||||
ConfigLoader.loadConfig(
|
||||
ConfigLoader.getConfigPath("system")
|
||||
);
|
||||
}
|
||||
|
||||
public ProcessMainClass() {
|
||||
new org.mariadb.jdbc.Driver();
|
||||
}
|
||||
|
||||
}
|
@@ -12,7 +12,11 @@ public enum ServiceId {
|
||||
Control("control-service"),
|
||||
|
||||
Dating("dating-service"),
|
||||
Explorer("explorer-service");
|
||||
Status("setatus-service"),
|
||||
Explorer("explorer-service"),
|
||||
|
||||
NOT_A_SERVICE("NOT_A_SERVICE")
|
||||
;
|
||||
|
||||
public final String serviceName;
|
||||
|
||||
|
@@ -4,13 +4,13 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import io.grpc.ManagedChannel;
|
||||
import io.grpc.ManagedChannelBuilder;
|
||||
import nu.marginalia.util.NamedExecutorFactory;
|
||||
import nu.marginalia.service.NodeConfigurationWatcher;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.discovery.property.PartitionTraits;
|
||||
import nu.marginalia.service.discovery.property.ServiceEndpoint.InstanceAddress;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.util.NamedExecutorFactory;
|
||||
|
||||
import java.util.concurrent.Executor;
|
||||
import java.util.function.Function;
|
||||
@@ -48,7 +48,12 @@ public class GrpcChannelPoolFactory {
|
||||
public <STUB> GrpcSingleNodeChannelPool<STUB> createSingle(ServiceKey<? extends PartitionTraits.Unicast> key,
|
||||
Function<ManagedChannel, STUB> stubConstructor)
|
||||
{
|
||||
return new GrpcSingleNodeChannelPool<>(serviceRegistryIf, key, this::createChannel, stubConstructor);
|
||||
try {
|
||||
return new GrpcSingleNodeChannelPool<>(serviceRegistryIf, key, this::createChannel, stubConstructor);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private ManagedChannel createChannel(InstanceAddress route) {
|
||||
|
@@ -1,7 +1,6 @@
|
||||
package nu.marginalia.service.client;
|
||||
|
||||
import io.grpc.ManagedChannel;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.service.NodeConfigurationWatcher;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.discovery.property.PartitionTraits;
|
||||
@@ -12,7 +11,10 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.concurrent.*;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.Executor;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.function.BiFunction;
|
||||
import java.util.function.Function;
|
||||
|
||||
@@ -29,7 +31,6 @@ public class GrpcMultiNodeChannelPool<STUB> {
|
||||
private final Function<ManagedChannel, STUB> stubConstructor;
|
||||
private final NodeConfigurationWatcher nodeConfigurationWatcher;
|
||||
|
||||
@SneakyThrows
|
||||
public GrpcMultiNodeChannelPool(ServiceRegistryIf serviceRegistryIf,
|
||||
ServiceKey<ServicePartition.Multi> serviceKey,
|
||||
Function<ServiceEndpoint.InstanceAddress, ManagedChannel> channelConstructor,
|
||||
@@ -52,11 +53,16 @@ public class GrpcMultiNodeChannelPool<STUB> {
|
||||
}
|
||||
|
||||
private GrpcSingleNodeChannelPool<STUB> newSingleChannelPool(int node) {
|
||||
return new GrpcSingleNodeChannelPool<>(
|
||||
serviceRegistryIf,
|
||||
serviceKey.forPartition(ServicePartition.partition(node)),
|
||||
channelConstructor,
|
||||
stubConstructor);
|
||||
try {
|
||||
return new GrpcSingleNodeChannelPool<>(
|
||||
serviceRegistryIf,
|
||||
serviceKey.forPartition(ServicePartition.partition(node)),
|
||||
channelConstructor,
|
||||
stubConstructor);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/** Get the list of nodes that are eligible for broadcast-style requests */
|
||||
|
@@ -2,7 +2,6 @@ package nu.marginalia.service.client;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import io.grpc.ManagedChannel;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.discovery.monitor.ServiceChangeMonitor;
|
||||
import nu.marginalia.service.discovery.property.PartitionTraits;
|
||||
@@ -14,7 +13,9 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.*;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.Executor;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
import java.util.function.BiFunction;
|
||||
import java.util.function.Function;
|
||||
@@ -32,11 +33,12 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
|
||||
private final Function<ManagedChannel, STUB> stubConstructor;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public GrpcSingleNodeChannelPool(ServiceRegistryIf serviceRegistryIf,
|
||||
ServiceKey<? extends PartitionTraits.Unicast> serviceKey,
|
||||
Function<InstanceAddress, ManagedChannel> channelConstructor,
|
||||
Function<ManagedChannel, STUB> stubConstructor) {
|
||||
Function<ManagedChannel, STUB> stubConstructor)
|
||||
throws Exception
|
||||
{
|
||||
super(serviceKey);
|
||||
|
||||
this.serviceRegistryIf = serviceRegistryIf;
|
||||
@@ -112,7 +114,7 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error(STR."Failed to get channel for \{address}", e);
|
||||
logger.error("Failed to get channel for " + address, e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
@@ -4,6 +4,6 @@ import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
|
||||
public class ServiceNotAvailableException extends RuntimeException {
|
||||
public ServiceNotAvailableException(ServiceKey<?> key) {
|
||||
super(STR."Service \{key} not available");
|
||||
super("Service " + key + " not available");
|
||||
}
|
||||
}
|
||||
|
@@ -11,4 +11,14 @@ public class FakeServiceHeartbeat implements ServiceHeartbeat {
|
||||
public void close() {}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public ServiceAdHocTaskHeartbeat createServiceAdHocTaskHeartbeat(String taskName) {
|
||||
return new ServiceAdHocTaskHeartbeat() {
|
||||
@Override
|
||||
public void progress(String step, int stepProgress, int stepCount) {}
|
||||
@Override
|
||||
public void close() {}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
@@ -0,0 +1,7 @@
|
||||
package nu.marginalia.service.control;
|
||||
|
||||
/** Heartbeat for an ad-hoc task within a service, where progress is reported
 * as a position out of a total.
 */
public interface ServiceAdHocTaskHeartbeat extends AutoCloseable {

    /** Report that the given step has reached {@code progress} out of {@code total}. */
    void progress(String step, int progress, int total);

    /** Redeclared without a throws clause, so callers need not handle checked exceptions. */
    @Override
    void close();
}
|
@@ -0,0 +1,190 @@
|
||||
package nu.marginalia.service.control;
|
||||
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/** This object sends a heartbeat to the database every few seconds,
|
||||
* updating with the progress of a task within a service. Progress is tracked by providing
|
||||
* enumerations corresponding to the steps in the task. It's important they're arranged in the same
|
||||
* order as the steps in the task in order to get an accurate progress tracking.
|
||||
*/
|
||||
public class ServiceAdHocTaskHeartbeatImpl implements AutoCloseable, ServiceAdHocTaskHeartbeat {
|
||||
private final Logger logger = LoggerFactory.getLogger(ServiceAdHocTaskHeartbeatImpl.class);
|
||||
private final String taskName;
|
||||
private final String taskBase;
|
||||
private final int node;
|
||||
private final String instanceUUID;
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
|
||||
private final Thread runnerThread;
|
||||
private final int heartbeatInterval = Integer.getInteger("mcp.heartbeat.interval", 1);
|
||||
private final String serviceInstanceUUID;
|
||||
private int progress;
|
||||
|
||||
private volatile boolean running = false;
|
||||
private volatile String step = "-";
|
||||
|
||||
ServiceAdHocTaskHeartbeatImpl(ServiceConfiguration configuration,
|
||||
String taskName,
|
||||
HikariDataSource dataSource)
|
||||
{
|
||||
this.taskName = configuration.serviceName() + "." + taskName + ":" + configuration.node();
|
||||
this.taskBase = configuration.serviceName() + "." + taskName;
|
||||
this.node = configuration.node();
|
||||
this.dataSource = dataSource;
|
||||
|
||||
this.instanceUUID = UUID.randomUUID().toString();
|
||||
this.serviceInstanceUUID = configuration.instanceUuid().toString();
|
||||
|
||||
heartbeatInit();
|
||||
|
||||
runnerThread = new Thread(this::run);
|
||||
runnerThread.start();
|
||||
}
|
||||
|
||||
/** Update the progress of the task. This is a fast function that doesn't block;
|
||||
* the actual update is done in a separate thread.
|
||||
*
|
||||
* @param step The current step in the task.
|
||||
*/
|
||||
@Override
|
||||
public void progress(String step, int stepProgress, int stepCount) {
|
||||
this.step = step;
|
||||
|
||||
|
||||
// off by one since we calculate the progress based on the number of steps,
|
||||
// and Enum.ordinal() is zero-based (so the 5th step in a 5 step task is 4, not 5; resulting in the
|
||||
// final progress being 80% and not 100%)
|
||||
|
||||
this.progress = (int) Math.round(100. * stepProgress / (double) stepCount);
|
||||
|
||||
logger.info("ServiceTask {} progress: {}%", taskBase, progress);
|
||||
}
|
||||
|
||||
public void shutDown() {
|
||||
if (!running)
|
||||
return;
|
||||
|
||||
running = false;
|
||||
|
||||
try {
|
||||
runnerThread.join();
|
||||
heartbeatStop();
|
||||
}
|
||||
catch (InterruptedException|SQLException ex) {
|
||||
logger.warn("ServiceHeartbeat shutdown failed", ex);
|
||||
}
|
||||
}
|
||||
|
||||
private void run() {
|
||||
if (!running)
|
||||
running = true;
|
||||
else
|
||||
return;
|
||||
|
||||
try {
|
||||
while (running) {
|
||||
try {
|
||||
heartbeatUpdate();
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.warn("ServiceHeartbeat failed to update", ex);
|
||||
}
|
||||
|
||||
TimeUnit.SECONDS.sleep(heartbeatInterval);
|
||||
}
|
||||
}
|
||||
catch (InterruptedException ex) {
|
||||
logger.error("ServiceHeartbeat caught irrecoverable exception, killing service", ex);
|
||||
System.exit(255);
|
||||
}
|
||||
}
|
||||
|
||||
private void heartbeatInit() {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
INSERT INTO TASK_HEARTBEAT (TASK_NAME, TASK_BASE, NODE, INSTANCE, SERVICE_INSTANCE, HEARTBEAT_TIME, STATUS)
|
||||
VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP(6), 'STARTING')
|
||||
ON DUPLICATE KEY UPDATE
|
||||
INSTANCE = ?,
|
||||
SERVICE_INSTANCE = ?,
|
||||
HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
|
||||
STATUS = 'STARTING'
|
||||
"""
|
||||
))
|
||||
{
|
||||
stmt.setString(1, taskName);
|
||||
stmt.setString(2, taskBase);
|
||||
stmt.setInt(3, node);
|
||||
stmt.setString(4, instanceUUID);
|
||||
stmt.setString(5, serviceInstanceUUID);
|
||||
stmt.setString(6, instanceUUID);
|
||||
stmt.setString(7, serviceInstanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("ServiceHeartbeat failed to initialize", ex);
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void heartbeatUpdate() throws SQLException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
UPDATE TASK_HEARTBEAT
|
||||
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
|
||||
STATUS = 'RUNNING',
|
||||
PROGRESS = ?,
|
||||
STAGE_NAME = ?
|
||||
WHERE INSTANCE = ?
|
||||
""")
|
||||
)
|
||||
{
|
||||
stmt.setInt(1, progress);
|
||||
stmt.setString(2, step);
|
||||
stmt.setString(3, instanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void heartbeatStop() throws SQLException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement(
|
||||
"""
|
||||
UPDATE TASK_HEARTBEAT
|
||||
SET HEARTBEAT_TIME = CURRENT_TIMESTAMP(6),
|
||||
STATUS='STOPPED',
|
||||
PROGRESS = ?,
|
||||
STAGE_NAME = ?
|
||||
WHERE INSTANCE = ?
|
||||
""")
|
||||
)
|
||||
{
|
||||
stmt.setInt(1, progress);
|
||||
stmt.setString( 2, step);
|
||||
stmt.setString( 3, instanceUUID);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
shutDown();
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -5,4 +5,5 @@ import com.google.inject.ImplementedBy;
|
||||
@ImplementedBy(ServiceHeartbeatImpl.class)
|
||||
public interface ServiceHeartbeat {
|
||||
<T extends Enum<T>> ServiceTaskHeartbeat<T> createServiceTaskHeartbeat(Class<T> steps, String processName);
|
||||
ServiceAdHocTaskHeartbeat createServiceAdHocTaskHeartbeat(String taskName);
|
||||
}
|
||||
|
@@ -54,6 +54,11 @@ public class ServiceHeartbeatImpl implements ServiceHeartbeat {
|
||||
return new ServiceTaskHeartbeatImpl<>(steps, configuration, processName, eventLog, dataSource);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ServiceAdHocTaskHeartbeat createServiceAdHocTaskHeartbeat(String taskName) {
|
||||
return new ServiceAdHocTaskHeartbeatImpl(configuration, taskName, dataSource);
|
||||
}
|
||||
|
||||
|
||||
public void start() {
|
||||
if (!running) {
|
||||
|
@@ -2,11 +2,8 @@ package nu.marginalia.service.discovery;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.service.discovery.monitor.*;
|
||||
import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
|
||||
import nu.marginalia.service.discovery.property.ServiceEndpoint;
|
||||
import static nu.marginalia.service.discovery.property.ServiceEndpoint.*;
|
||||
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import org.apache.curator.framework.CuratorFramework;
|
||||
import org.apache.curator.utils.ZKPaths;
|
||||
@@ -16,9 +13,14 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import static nu.marginalia.service.discovery.property.ServiceEndpoint.InstanceAddress;
|
||||
|
||||
/** A versatile service registry that uses ZooKeeper to store service endpoints.
|
||||
* It is used to register services and to look up the endpoints of other services.
|
||||
* <p></p>
|
||||
@@ -37,18 +39,22 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
|
||||
private final List<String> livenessPaths = new ArrayList<>();
|
||||
|
||||
@Inject
|
||||
@SneakyThrows
|
||||
public ZkServiceRegistry(CuratorFramework curatorFramework) {
|
||||
this.curatorFramework = curatorFramework;
|
||||
try {
|
||||
this.curatorFramework = curatorFramework;
|
||||
|
||||
curatorFramework.start();
|
||||
if (!curatorFramework.blockUntilConnected(30, TimeUnit.SECONDS)) {
|
||||
throw new IllegalStateException("Failed to connect to zookeeper after 30s");
|
||||
curatorFramework.start();
|
||||
if (!curatorFramework.blockUntilConnected(30, TimeUnit.SECONDS)) {
|
||||
throw new IllegalStateException("Failed to connect to zookeeper after 30s");
|
||||
}
|
||||
|
||||
Runtime.getRuntime().addShutdownHook(
|
||||
new Thread(this::shutDown, "ZkServiceRegistry shutdown hook")
|
||||
);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException("Failed to start ZkServiceRegistry", ex);
|
||||
}
|
||||
|
||||
Runtime.getRuntime().addShutdownHook(
|
||||
new Thread(this::shutDown, "ZkServiceRegistry shutdown hook")
|
||||
);
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -59,8 +65,8 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
|
||||
{
|
||||
var endpoint = new ServiceEndpoint(externalAddress, requestPort(externalAddress, key));
|
||||
|
||||
String path = STR."\{key.toPath()}/\{instanceUUID.toString()}";
|
||||
byte[] payload = STR."\{endpoint.host()}:\{endpoint.port()}".getBytes(StandardCharsets.UTF_8);
|
||||
String path = key.toPath() + "/" + instanceUUID.toString();
|
||||
byte[] payload = (endpoint.host() + ":" + endpoint.port()).getBytes(StandardCharsets.UTF_8);
|
||||
|
||||
logger.info("Registering {} -> {}", path, endpoint);
|
||||
|
||||
@@ -72,14 +78,18 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
|
||||
return endpoint;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public void declareFirstBoot() {
|
||||
if (!isFirstBoot()) {
|
||||
curatorFramework.create()
|
||||
.creatingParentsIfNeeded()
|
||||
.withMode(CreateMode.PERSISTENT)
|
||||
.forPath("/first-boot");
|
||||
try {
|
||||
curatorFramework.create()
|
||||
.creatingParentsIfNeeded()
|
||||
.withMode(CreateMode.PERSISTENT)
|
||||
.forPath("/first-boot");
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Failed to declare first-boot", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -109,7 +119,7 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
|
||||
@Override
|
||||
public void announceInstance(UUID instanceUUID) {
|
||||
try {
|
||||
String serviceRoot = STR."/running-instances/\{instanceUUID.toString()}";
|
||||
String serviceRoot = "/running-instances/" + instanceUUID.toString();
|
||||
|
||||
livenessPaths.add(serviceRoot);
|
||||
|
||||
@@ -128,7 +138,7 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
|
||||
*/
|
||||
public boolean isInstanceRunning(UUID instanceUUID) {
|
||||
try {
|
||||
String serviceRoot = STR."/running-instances/\{instanceUUID.toString()}";
|
||||
String serviceRoot = "/running-instances/" + instanceUUID.toString();
|
||||
return null != curatorFramework.checkExists().forPath(serviceRoot);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
@@ -165,11 +175,11 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
|
||||
curatorFramework.create()
|
||||
.creatingParentsIfNeeded()
|
||||
.withMode(CreateMode.EPHEMERAL)
|
||||
.forPath(STR."/port-registry/\{externalHost}/\{port}", payload);
|
||||
.forPath("/port-registry/" + externalHost + "/" + port, payload);
|
||||
return port;
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error(STR."Still negotiating port for \{identifier}");
|
||||
logger.error("Still negotiating port for " + identifier);
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -38,7 +38,7 @@ public sealed interface ServiceKey<P extends ServicePartition> {
|
||||
|
||||
record Rest(String name) implements ServiceKey<ServicePartition.None> {
|
||||
public String toPath() {
|
||||
return STR."/services/rest/\{name}";
|
||||
return "/services/rest/" + name;
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -51,10 +51,10 @@ public sealed interface ServiceKey<P extends ServicePartition> {
|
||||
}
|
||||
record Grpc<P extends ServicePartition>(String name, P partition) implements ServiceKey<P> {
|
||||
public String baseName() {
|
||||
return STR."/services/grpc/\{name}";
|
||||
return "/services/grpc/" + name;
|
||||
}
|
||||
public String toPath() {
|
||||
return STR."/services/grpc/\{name}/\{partition.identifier()}";
|
||||
return "/services/grpc/" + name + "/" + partition.identifier();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@@ -5,14 +5,12 @@ import com.google.inject.Provides;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariConfig;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import org.flywaydb.core.Flyway;
|
||||
import org.mariadb.jdbc.Driver;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.sql.DataSource;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
@@ -71,14 +69,12 @@ public class DatabaseModule extends AbstractModule {
|
||||
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Singleton
|
||||
@Provides
|
||||
public HikariDataSource provideConnection() {
|
||||
return getMariaDB();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private HikariDataSource getMariaDB() {
|
||||
var connStr = System.getProperty("db.overrideJdbc", dbProperties.getProperty(DB_CONN_KEY));
|
||||
|
||||
|
@@ -6,6 +6,9 @@ import nu.marginalia.service.ServiceId;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.net.InetAddress;
|
||||
import java.net.NetworkInterface;
|
||||
import java.util.Enumeration;
|
||||
import java.util.Objects;
|
||||
import java.util.UUID;
|
||||
|
||||
@@ -69,6 +72,17 @@ public class ServiceConfigurationModule extends AbstractModule {
|
||||
return configuredValue;
|
||||
}
|
||||
|
||||
if (Boolean.getBoolean("system.multiFace")) {
|
||||
try {
|
||||
String localNetworkIp = getLocalNetworkIP();
|
||||
if (null != localNetworkIp) {
|
||||
return localNetworkIp;
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.warn("Failed to get local network IP", ex);
|
||||
}
|
||||
}
|
||||
// If we're in docker, we'll use the hostname
|
||||
if (Boolean.getBoolean("service.useDockerHostname")) {
|
||||
return System.getenv("HOSTNAME");
|
||||
@@ -84,10 +98,41 @@ public class ServiceConfigurationModule extends AbstractModule {
|
||||
private String getBindAddress() {
|
||||
String configuredValue = System.getProperty("service.bind-address");
|
||||
if (configuredValue != null) {
|
||||
logger.info("Using configured bind address {}", configuredValue);
|
||||
return configuredValue;
|
||||
}
|
||||
|
||||
return "127.0.0.1";
|
||||
if (Boolean.getBoolean("system.multiFace")) {
|
||||
try {
|
||||
return Objects.requireNonNullElse(getLocalNetworkIP(), "0.0.0.0");
|
||||
} catch (Exception ex) {
|
||||
logger.warn("Failed to get local network IP, falling back to bind to 0.0.0.0", ex);
|
||||
return "0.0.0.0";
|
||||
}
|
||||
}
|
||||
else {
|
||||
return "0.0.0.0";
|
||||
}
|
||||
}
|
||||
|
||||
public static String getLocalNetworkIP() throws Exception {
|
||||
Enumeration<NetworkInterface> nets = NetworkInterface.getNetworkInterfaces();
|
||||
|
||||
while (nets.hasMoreElements()) {
|
||||
NetworkInterface netif = nets.nextElement();
|
||||
if (!netif.isUp() || netif.isLoopback()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Enumeration<InetAddress> inetAddresses = netif.getInetAddresses();
|
||||
while (inetAddresses.hasMoreElements()) {
|
||||
InetAddress addr = inetAddresses.nextElement();
|
||||
if (addr.isSiteLocalAddress() && !addr.isLoopbackAddress()) {
|
||||
return addr.getHostAddress();
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -1,7 +1,6 @@
|
||||
package nu.marginalia.service.server;
|
||||
|
||||
import com.google.inject.Singleton;
|
||||
import lombok.SneakyThrows;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@@ -59,13 +58,17 @@ public class Initialization {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public boolean waitReady() {
|
||||
synchronized (this) {
|
||||
while (!initialized) {
|
||||
wait();
|
||||
try {
|
||||
synchronized (this) {
|
||||
while (!initialized) {
|
||||
wait();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
catch (InterruptedException ex) {
|
||||
throw new RuntimeException("Interrupted while waiting for initialization", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -2,22 +2,23 @@ package nu.marginalia.service.server;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import io.prometheus.client.exporter.MetricsServlet;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.eclipse.jetty.server.Server;
|
||||
import org.eclipse.jetty.servlet.ServletContextHandler;
|
||||
import org.eclipse.jetty.servlet.ServletHolder;
|
||||
|
||||
import java.net.InetSocketAddress;
|
||||
|
||||
public class MetricsServer {
|
||||
|
||||
@SneakyThrows
|
||||
@Inject
|
||||
public MetricsServer(ServiceConfiguration configuration) {
|
||||
public MetricsServer(ServiceConfiguration configuration) throws Exception {
|
||||
// If less than zero, we forego setting up a metrics server
|
||||
if (configuration.metricsPort() < 0)
|
||||
return;
|
||||
|
||||
Server server = new Server(configuration.metricsPort());
|
||||
Server server = new Server(new InetSocketAddress(configuration.bindAddress(), configuration.metricsPort()));
|
||||
|
||||
ServletContextHandler context = new ServletContextHandler();
|
||||
context.setContextPath("/");
|
||||
server.setHandler(context);
|
||||
|
@@ -1,10 +1,10 @@
|
||||
package nu.marginalia.service.server;
|
||||
|
||||
import com.google.inject.name.Named;
|
||||
import com.google.inject.Inject;
|
||||
import lombok.SneakyThrows;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.nodecfg.NodeConfigurationService;
|
||||
import nu.marginalia.nodecfg.model.NodeProfile;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageBaseType;
|
||||
import org.slf4j.Logger;
|
||||
@@ -57,7 +57,9 @@ public class NodeStatusWatcher {
|
||||
|
||||
private void setupNode() {
|
||||
try {
|
||||
configurationService.create(nodeId, "Node " + nodeId, true, false);
|
||||
NodeProfile profile = NodeProfile.MIXED;
|
||||
|
||||
configurationService.create(nodeId, "Node " + nodeId, true, false, profile);
|
||||
|
||||
fileStorageService.createStorageBase("Index Data", Path.of("/idx"), nodeId, FileStorageBaseType.CURRENT);
|
||||
fileStorageService.createStorageBase("Index Backups", Path.of("/backup"), nodeId, FileStorageBaseType.BACKUP);
|
||||
@@ -81,10 +83,14 @@ public class NodeStatusWatcher {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private boolean isConfigured() {
|
||||
var configuration = configurationService.get(nodeId);
|
||||
return configuration != null;
|
||||
try {
|
||||
var configuration = configurationService.get(nodeId);
|
||||
return configuration != null;
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
/** Look for changes in the configuration and kill the service if the corresponding
|
||||
|
@@ -1,7 +1,6 @@
|
||||
package nu.marginalia.service.server;
|
||||
|
||||
import io.prometheus.client.Counter;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.mq.inbox.MqInboxIf;
|
||||
import nu.marginalia.service.client.ServiceNotAvailableException;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
@@ -44,11 +43,10 @@ public class Service {
|
||||
private final int node;
|
||||
private GrpcServer grpcServer;
|
||||
|
||||
@SneakyThrows
|
||||
public Service(BaseServiceParams params,
|
||||
Runnable configureStaticFiles,
|
||||
ServicePartition partition,
|
||||
List<DiscoverableService> grpcServices) {
|
||||
List<DiscoverableService> grpcServices) throws Exception {
|
||||
|
||||
this.initialization = params.initialization;
|
||||
var config = params.configuration;
|
||||
@@ -130,14 +128,14 @@ public class Service {
|
||||
|
||||
public Service(BaseServiceParams params,
|
||||
ServicePartition partition,
|
||||
List<DiscoverableService> grpcServices) {
|
||||
List<DiscoverableService> grpcServices) throws Exception {
|
||||
this(params,
|
||||
Service::defaultSparkConfig,
|
||||
partition,
|
||||
grpcServices);
|
||||
}
|
||||
|
||||
public Service(BaseServiceParams params) {
|
||||
public Service(BaseServiceParams params) throws Exception {
|
||||
this(params,
|
||||
Service::defaultSparkConfig,
|
||||
ServicePartition.any(),
|
||||
|
@@ -1,20 +1,18 @@
|
||||
package nu.marginalia.service.server;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
import spark.resource.ClassPathResource;
|
||||
import spark.staticfiles.MimeType;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.ZoneOffset;
|
||||
|
||||
public class StaticResources {
|
||||
private final long startTime = LocalDateTime.now().toEpochSecond(ZoneOffset.UTC);
|
||||
|
||||
@SneakyThrows
|
||||
public void serveStatic(String domain, String path, Request req, Response rsp) {
|
||||
try {
|
||||
if (path.startsWith("..") || domain.startsWith("..")) {
|
||||
@@ -28,7 +26,7 @@ public class StaticResources {
|
||||
|
||||
resource.getInputStream().transferTo(rsp.raw().getOutputStream());
|
||||
}
|
||||
catch (IllegalArgumentException | FileNotFoundException ex) {
|
||||
catch (IllegalArgumentException | IOException ex) {
|
||||
Spark.halt(404);
|
||||
}
|
||||
}
|
||||
@@ -57,7 +55,6 @@ public class StaticResources {
|
||||
return "application/octet-stream";
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private void handleEtagStatic(ClassPathResource resource, Request req, Response rsp) {
|
||||
rsp.header("Cache-Control", "public,max-age=3600");
|
||||
rsp.type(MimeType.fromResource(resource));
|
||||
|
@@ -24,7 +24,7 @@ public class NamedExecutorFactory {
|
||||
|
||||
@Override
|
||||
public Thread newThread(@NotNull Runnable r) {
|
||||
var thread = new Thread(r, STR."\{name}[\{threadNumber.getAndIncrement()}]");
|
||||
var thread = new Thread(r, name + "[" + threadNumber.getAndIncrement() + "]");
|
||||
thread.setDaemon(true);
|
||||
return thread;
|
||||
}
|
||||
|
@@ -1,9 +1,9 @@
|
||||
package nu.marginalia.service.discovery;
|
||||
|
||||
import nu.marginalia.service.ServiceId;
|
||||
import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.service.ServiceId;
|
||||
import nu.marginalia.test.TestApiGrpc;
|
||||
import org.apache.curator.framework.CuratorFrameworkFactory;
|
||||
import org.apache.curator.retry.ExponentialBackoffRetry;
|
||||
@@ -33,7 +33,7 @@ class ZkServiceRegistryTest {
|
||||
@BeforeEach
|
||||
public void setUp() {
|
||||
zookeeper.start();
|
||||
connectString = STR."\{zookeeper.getHost()}:\{zookeeper.getMappedPort(ZOOKEEPER_PORT)}";
|
||||
connectString = zookeeper.getHost() + ":" + zookeeper.getMappedPort(ZOOKEEPER_PORT);
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
|
@@ -9,23 +9,25 @@ import nu.marginalia.executor.storage.FileStorageFile;
|
||||
import nu.marginalia.executor.upload.UploadDirContents;
|
||||
import nu.marginalia.executor.upload.UploadDirItem;
|
||||
import nu.marginalia.functions.execution.api.*;
|
||||
import nu.marginalia.service.ServiceId;
|
||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.service.ServiceId;
|
||||
import nu.marginalia.storage.model.FileStorage;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.net.*;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.URL;
|
||||
import java.net.URLEncoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
|
||||
import static nu.marginalia.functions.execution.api.ExecutorApiGrpc.*;
|
||||
import static nu.marginalia.functions.execution.api.ExecutorApiGrpc.ExecutorApiBlockingStub;
|
||||
|
||||
@Singleton
|
||||
public class ExecutorClient {
|
||||
@@ -163,8 +165,8 @@ public class ExecutorClient {
|
||||
* The endpoint is compatible with range requests.
|
||||
* */
|
||||
public URL remoteFileURL(FileStorage fileStorage, String path) {
|
||||
String uriPath = STR."/transfer/file/\{fileStorage.id()}";
|
||||
String uriQuery = STR."path=\{URLEncoder.encode(path, StandardCharsets.UTF_8)}";
|
||||
String uriPath = "/transfer/file/" + fileStorage.id();
|
||||
String uriQuery = "path=" + URLEncoder.encode(path, StandardCharsets.UTF_8);
|
||||
|
||||
var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Executor, fileStorage.node()));
|
||||
if (endpoints.isEmpty()) {
|
||||
@@ -180,4 +182,10 @@ public class ExecutorClient {
|
||||
}
|
||||
}
|
||||
|
||||
public void restartExecutorService(int node) {
|
||||
channelPool.call(ExecutorApiBlockingStub::restartExecutorService)
|
||||
.forNode(node)
|
||||
.run(Empty.getDefaultInstance());
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -3,6 +3,7 @@ package nu.marginalia.executor.client;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.functions.execution.api.*;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
@@ -11,6 +12,8 @@ import nu.marginalia.storage.model.FileStorageId;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.time.Duration;
|
||||
|
||||
import static nu.marginalia.functions.execution.api.ExecutorExportApiGrpc.ExecutorExportApiBlockingStub;
|
||||
|
||||
@Singleton
|
||||
@@ -18,23 +21,33 @@ public class ExecutorExportClient {
|
||||
private final GrpcMultiNodeChannelPool<ExecutorExportApiBlockingStub> channelPool;
|
||||
private static final Logger logger = LoggerFactory.getLogger(ExecutorExportClient.class);
|
||||
|
||||
private final MqPersistence persistence;
|
||||
@Inject
|
||||
public ExecutorExportClient(GrpcChannelPoolFactory grpcChannelPoolFactory)
|
||||
public ExecutorExportClient(GrpcChannelPoolFactory grpcChannelPoolFactory, MqPersistence persistence)
|
||||
{
|
||||
this.channelPool = grpcChannelPoolFactory
|
||||
.createMulti(
|
||||
ServiceKey.forGrpcApi(ExecutorExportApiGrpc.class, ServicePartition.multi()),
|
||||
ExecutorExportApiGrpc::newBlockingStub);
|
||||
this.persistence = persistence;
|
||||
}
|
||||
|
||||
long createTrackingTokenMsg(String task, int node, Duration ttl) throws Exception {
|
||||
return persistence.sendNewMessage("task-tracking[" + node + "]", "export-client", null, task, "", ttl);
|
||||
}
|
||||
|
||||
public long exportAtags(int node, FileStorageId fid) throws Exception {
|
||||
long msgId = createTrackingTokenMsg("atags", node, Duration.ofHours(6));
|
||||
|
||||
public void exportAtags(int node, FileStorageId fid) {
|
||||
channelPool.call(ExecutorExportApiBlockingStub::exportAtags)
|
||||
.forNode(node)
|
||||
.run(RpcFileStorageId.newBuilder()
|
||||
.run(RpcExportRequest.newBuilder()
|
||||
.setFileStorageId(fid.id())
|
||||
.setMsgId(msgId)
|
||||
.build());
|
||||
return msgId;
|
||||
}
|
||||
|
||||
public void exportSampleData(int node, FileStorageId fid, int size, String name) {
|
||||
channelPool.call(ExecutorExportApiBlockingStub::exportSampleData)
|
||||
.forNode(node)
|
||||
@@ -45,20 +58,26 @@ public class ExecutorExportClient {
|
||||
.build());
|
||||
}
|
||||
|
||||
public void exportRssFeeds(int node, FileStorageId fid) {
|
||||
public long exportRssFeeds(int node, FileStorageId fid) throws Exception {
|
||||
long msgId = createTrackingTokenMsg("rss", node, Duration.ofHours(6));
|
||||
channelPool.call(ExecutorExportApiBlockingStub::exportRssFeeds)
|
||||
.forNode(node)
|
||||
.run(RpcFileStorageId.newBuilder()
|
||||
.run(RpcExportRequest.newBuilder()
|
||||
.setFileStorageId(fid.id())
|
||||
.setMsgId(msgId)
|
||||
.build());
|
||||
return msgId;
|
||||
}
|
||||
|
||||
public void exportTermFrequencies(int node, FileStorageId fid) {
|
||||
public long exportTermFrequencies(int node, FileStorageId fid) throws Exception {
|
||||
long msgId = createTrackingTokenMsg("tfreq", node, Duration.ofHours(6));
|
||||
channelPool.call(ExecutorExportApiBlockingStub::exportTermFrequencies)
|
||||
.forNode(node)
|
||||
.run(RpcFileStorageId.newBuilder()
|
||||
.run(RpcExportRequest.newBuilder()
|
||||
.setFileStorageId(fid.id())
|
||||
.setMsgId(msgId)
|
||||
.build());
|
||||
return msgId;
|
||||
}
|
||||
|
||||
public void exportData(int node) {
|
||||
@@ -77,4 +96,21 @@ public class ExecutorExportClient {
|
||||
}
|
||||
|
||||
|
||||
public void exportAllAtags() {
|
||||
channelPool.call(ExecutorExportApiBlockingStub::exportAllAtags)
|
||||
.forNode(1)
|
||||
.run(Empty.getDefaultInstance());
|
||||
}
|
||||
|
||||
public void exportAllFeeds() {
|
||||
channelPool.call(ExecutorExportApiBlockingStub::exportAllFeeds)
|
||||
.forNode(1)
|
||||
.run(Empty.getDefaultInstance());
|
||||
}
|
||||
|
||||
public void exportAllTfreqs() {
|
||||
channelPool.call(ExecutorExportApiBlockingStub::exportAllTfreqs)
|
||||
.forNode(1)
|
||||
.run(Empty.getDefaultInstance());
|
||||
}
|
||||
}
|
||||
|
@@ -17,6 +17,8 @@ service ExecutorApi {
|
||||
rpc downloadSampleData(RpcDownloadSampleData) returns (Empty) {}
|
||||
rpc calculateAdjacencies(Empty) returns (Empty) {}
|
||||
rpc restoreBackup(RpcFileStorageId) returns (Empty) {}
|
||||
|
||||
rpc restartExecutorService(Empty) returns (Empty) {}
|
||||
}
|
||||
|
||||
service ExecutorCrawlApi {
|
||||
@@ -37,15 +39,20 @@ service ExecutorSideloadApi {
|
||||
}
|
||||
|
||||
service ExecutorExportApi {
|
||||
rpc exportAtags(RpcFileStorageId) returns (Empty) {}
|
||||
rpc exportAtags(RpcExportRequest) returns (Empty) {}
|
||||
rpc exportSegmentationModel(RpcExportSegmentationModel) returns (Empty) {}
|
||||
rpc exportSampleData(RpcExportSampleData) returns (Empty) {}
|
||||
rpc exportRssFeeds(RpcFileStorageId) returns (Empty) {}
|
||||
rpc exportTermFrequencies(RpcFileStorageId) returns (Empty) {}
|
||||
rpc exportRssFeeds(RpcExportRequest) returns (Empty) {}
|
||||
rpc exportTermFrequencies(RpcExportRequest) returns (Empty) {}
|
||||
rpc exportData(Empty) returns (Empty) {}
|
||||
|
||||
rpc exportAllAtags(Empty) returns (Empty) {}
|
||||
rpc exportAllFeeds(Empty) returns (Empty) {}
|
||||
rpc exportAllTfreqs(Empty) returns (Empty) {}
|
||||
}
|
||||
|
||||
message Empty {}
|
||||
|
||||
message RpcFsmName {
|
||||
string actorName = 1;
|
||||
}
|
||||
@@ -55,6 +62,10 @@ message RpcProcessId {
|
||||
message RpcFileStorageId {
|
||||
int64 fileStorageId = 1;
|
||||
}
|
||||
message RpcExportRequest {
|
||||
int64 fileStorageId = 1;
|
||||
int64 msgId = 2;
|
||||
}
|
||||
message RpcFileStorageIdWithDomainName {
|
||||
int64 fileStorageId = 1;
|
||||
string targetDomainName = 2;
|
||||
|
@@ -15,15 +15,15 @@ dependencies {
|
||||
// These look weird but they're needed to be able to spawn the processes
|
||||
// from the executor service
|
||||
|
||||
implementation project(':code:processes:website-adjacencies-calculator')
|
||||
implementation project(':code:processes:export-task-process')
|
||||
implementation project(':code:processes:crawling-process')
|
||||
implementation project(':code:processes:live-crawling-process')
|
||||
implementation project(':code:processes:loading-process')
|
||||
implementation project(':code:processes:converting-process')
|
||||
implementation project(':code:processes:index-constructor-process')
|
||||
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:process')
|
||||
implementation project(':code:common:db')
|
||||
implementation project(':code:common:linkdb')
|
||||
|
||||
@@ -35,13 +35,13 @@ dependencies {
|
||||
implementation project(':code:libraries:term-frequency-dict')
|
||||
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
implementation project(':code:functions:live-capture:api')
|
||||
implementation project(':code:functions:search-query')
|
||||
implementation project(':code:execution:api')
|
||||
|
||||
implementation project(':code:processes:crawling-process:model')
|
||||
implementation project(':code:processes:crawling-process:model')
|
||||
implementation project(':code:processes:crawling-process:ft-link-parser')
|
||||
implementation project(':code:execution:data-extractors')
|
||||
implementation project(':code:index:index-journal')
|
||||
implementation project(':code:index:api')
|
||||
implementation project(':code:processes:process-mq-api')
|
||||
|
@@ -1,7 +0,0 @@
|
||||
Contains converter-*like* extraction jobs that operate on crawled data to produce export files.
|
||||
|
||||
## Important classes
|
||||
|
||||
* [AtagExporter](java/nu/marginalia/extractor/AtagExporter.java) - extracts anchor texts from the crawled data.
|
||||
* [FeedExporter](java/nu/marginalia/extractor/FeedExporter.java) - tries to find RSS/Atom feeds within the crawled data.
|
||||
* [TermFrequencyExporter](java/nu/marginalia/extractor/TermFrequencyExporter.java) - exports the 'TF' part of TF-IDF.
|
@@ -1,28 +1,40 @@
|
||||
package nu.marginalia.actor;
|
||||
|
||||
import nu.marginalia.nodecfg.model.NodeProfile;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
public enum ExecutorActor {
|
||||
CRAWL,
|
||||
RECRAWL,
|
||||
RECRAWL_SINGLE_DOMAIN,
|
||||
CONVERT_AND_LOAD,
|
||||
PROC_CONVERTER_SPAWNER,
|
||||
PROC_LOADER_SPAWNER,
|
||||
PROC_CRAWLER_SPAWNER,
|
||||
MONITOR_PROCESS_LIVENESS,
|
||||
MONITOR_FILE_STORAGE,
|
||||
ADJACENCY_CALCULATION,
|
||||
CRAWL_JOB_EXTRACTOR,
|
||||
EXPORT_DATA,
|
||||
EXPORT_SEGMENTATION_MODEL,
|
||||
EXPORT_ATAGS,
|
||||
EXPORT_TERM_FREQUENCIES,
|
||||
EXPORT_FEEDS,
|
||||
PROC_INDEX_CONSTRUCTOR_SPAWNER,
|
||||
CONVERT,
|
||||
RESTORE_BACKUP,
|
||||
EXPORT_SAMPLE_DATA,
|
||||
DOWNLOAD_SAMPLE,
|
||||
SCRAPE_FEEDS;
|
||||
PREC_EXPORT_ALL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
|
||||
CRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
RECRAWL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
RECRAWL_SINGLE_DOMAIN(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
PROC_CRAWLER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
PROC_EXPORT_TASKS_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
ADJACENCY_CALCULATION(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
EXPORT_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
EXPORT_SEGMENTATION_MODEL(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
EXPORT_ATAGS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
EXPORT_TERM_FREQUENCIES(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
EXPORT_FEEDS(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
EXPORT_SAMPLE_DATA(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
DOWNLOAD_SAMPLE(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED),
|
||||
|
||||
PROC_CONVERTER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD),
|
||||
PROC_LOADER_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD),
|
||||
RESTORE_BACKUP(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD),
|
||||
CONVERT(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.SIDELOAD),
|
||||
|
||||
CONVERT_AND_LOAD(NodeProfile.BATCH_CRAWL, NodeProfile.MIXED, NodeProfile.REALTIME, NodeProfile.SIDELOAD),
|
||||
MONITOR_PROCESS_LIVENESS(NodeProfile.BATCH_CRAWL, NodeProfile.REALTIME, NodeProfile.MIXED, NodeProfile.SIDELOAD),
|
||||
MONITOR_FILE_STORAGE(NodeProfile.BATCH_CRAWL, NodeProfile.REALTIME, NodeProfile.MIXED, NodeProfile.SIDELOAD),
|
||||
PROC_INDEX_CONSTRUCTOR_SPAWNER(NodeProfile.BATCH_CRAWL, NodeProfile.REALTIME, NodeProfile.MIXED, NodeProfile.SIDELOAD),
|
||||
|
||||
LIVE_CRAWL(NodeProfile.REALTIME),
|
||||
PROC_LIVE_CRAWL_SPAWNER(NodeProfile.REALTIME),
|
||||
SCRAPE_FEEDS(NodeProfile.REALTIME),
|
||||
UPDATE_RSS(NodeProfile.REALTIME);
|
||||
|
||||
public String id() {
|
||||
return "fsm:" + name().toLowerCase();
|
||||
@@ -32,4 +44,9 @@ public enum ExecutorActor {
|
||||
return "fsm:" + name().toLowerCase() + ":" + node;
|
||||
}
|
||||
|
||||
ExecutorActor(NodeProfile... profileSet) {
|
||||
this.profileSet = Set.of(profileSet);
|
||||
}
|
||||
|
||||
public Set<NodeProfile> profileSet;
|
||||
}
|
||||
|
@@ -2,8 +2,8 @@ package nu.marginalia.actor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.actor.monitor.FileStorageMonitorActor;
|
||||
import nu.marginalia.actor.precession.ExportAllPrecessionActor;
|
||||
import nu.marginalia.actor.proc.*;
|
||||
import nu.marginalia.actor.prototype.ActorPrototype;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
@@ -11,9 +11,15 @@ import nu.marginalia.actor.state.ActorStateInstance;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.actor.task.*;
|
||||
import nu.marginalia.mq.MessageQueueFactory;
|
||||
import nu.marginalia.nodecfg.NodeConfigurationService;
|
||||
import nu.marginalia.nodecfg.model.NodeConfiguration;
|
||||
import nu.marginalia.service.control.ServiceEventLog;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
@@ -27,16 +33,24 @@ public class ExecutorActorControlService {
|
||||
public Map<ExecutorActor, ActorPrototype> actorDefinitions = new HashMap<>();
|
||||
private final int node;
|
||||
|
||||
private final NodeConfiguration nodeConfiguration;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@Inject
|
||||
public ExecutorActorControlService(MessageQueueFactory messageQueueFactory,
|
||||
ServiceConfiguration serviceConfiguration,
|
||||
NodeConfigurationService configurationService,
|
||||
BaseServiceParams baseServiceParams,
|
||||
ConvertActor convertActor,
|
||||
ConvertAndLoadActor convertAndLoadActor,
|
||||
CrawlActor crawlActor,
|
||||
LiveCrawlActor liveCrawlActor,
|
||||
RecrawlSingleDomainActor recrawlSingleDomainActor,
|
||||
RestoreBackupActor restoreBackupActor,
|
||||
ConverterMonitorActor converterMonitorFSM,
|
||||
CrawlerMonitorActor crawlerMonitorActor,
|
||||
LiveCrawlerMonitorActor liveCrawlerMonitorActor,
|
||||
LoaderMonitorActor loaderMonitor,
|
||||
ProcessLivenessMonitorActor processMonitorFSM,
|
||||
FileStorageMonitorActor fileStorageMonitorActor,
|
||||
@@ -48,15 +62,21 @@ public class ExecutorActorControlService {
|
||||
ExportSampleDataActor exportSampleDataActor,
|
||||
ExportTermFreqActor exportTermFrequenciesActor,
|
||||
ExportSegmentationModelActor exportSegmentationModelActor,
|
||||
ExportTaskMonitorActor exportTasksMonitorActor,
|
||||
DownloadSampleActor downloadSampleActor,
|
||||
ScrapeFeedsActor scrapeFeedsActor,
|
||||
ExecutorActorStateMachines stateMachines) {
|
||||
ExecutorActorStateMachines stateMachines,
|
||||
ExportAllPrecessionActor exportAllPrecessionActor,
|
||||
UpdateRssActor updateRssActor) throws SQLException {
|
||||
this.messageQueueFactory = messageQueueFactory;
|
||||
this.eventLog = baseServiceParams.eventLog;
|
||||
this.stateMachines = stateMachines;
|
||||
this.node = baseServiceParams.configuration.node();
|
||||
|
||||
this.nodeConfiguration = configurationService.get(node);
|
||||
|
||||
register(ExecutorActor.CRAWL, crawlActor);
|
||||
register(ExecutorActor.LIVE_CRAWL, liveCrawlActor);
|
||||
register(ExecutorActor.RECRAWL_SINGLE_DOMAIN, recrawlSingleDomainActor);
|
||||
|
||||
register(ExecutorActor.CONVERT, convertActor);
|
||||
@@ -67,6 +87,8 @@ public class ExecutorActorControlService {
|
||||
register(ExecutorActor.PROC_CONVERTER_SPAWNER, converterMonitorFSM);
|
||||
register(ExecutorActor.PROC_LOADER_SPAWNER, loaderMonitor);
|
||||
register(ExecutorActor.PROC_CRAWLER_SPAWNER, crawlerMonitorActor);
|
||||
register(ExecutorActor.PROC_LIVE_CRAWL_SPAWNER, liveCrawlerMonitorActor);
|
||||
register(ExecutorActor.PROC_EXPORT_TASKS_SPAWNER, exportTasksMonitorActor);
|
||||
|
||||
register(ExecutorActor.MONITOR_PROCESS_LIVENESS, processMonitorFSM);
|
||||
register(ExecutorActor.MONITOR_FILE_STORAGE, fileStorageMonitorActor);
|
||||
@@ -83,9 +105,19 @@ public class ExecutorActorControlService {
|
||||
register(ExecutorActor.DOWNLOAD_SAMPLE, downloadSampleActor);
|
||||
|
||||
register(ExecutorActor.SCRAPE_FEEDS, scrapeFeedsActor);
|
||||
register(ExecutorActor.UPDATE_RSS, updateRssActor);
|
||||
|
||||
if (serviceConfiguration.node() == 1) {
|
||||
register(ExecutorActor.PREC_EXPORT_ALL, exportAllPrecessionActor);
|
||||
}
|
||||
}
|
||||
|
||||
private void register(ExecutorActor process, RecordActorPrototype graph) {
|
||||
|
||||
if (!process.profileSet.contains(nodeConfiguration.profile())) {
|
||||
return;
|
||||
}
|
||||
|
||||
var sm = new ActorStateMachine(messageQueueFactory, process.id(), node, UUID.randomUUID(), graph);
|
||||
sm.listen((function, param) -> logStateChange(process, function));
|
||||
|
||||
@@ -117,11 +149,15 @@ public class ExecutorActorControlService {
|
||||
stateMachines.startFromJSON(process, state, json);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void stop(ExecutorActor process) {
|
||||
eventLog.logEvent("FSM-STOP", process.id());
|
||||
|
||||
stateMachines.stop(process);
|
||||
try {
|
||||
stateMachines.stop(process);
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Failed to stop FSM", e);
|
||||
}
|
||||
}
|
||||
|
||||
public Map<ExecutorActor, ActorStateInstance> getActorStates() {
|
||||
|
@@ -0,0 +1,116 @@
|
||||
package nu.marginalia.actor.precession;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.executor.client.ExecutorExportClient;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.nodecfg.NodeConfigurationService;
|
||||
import nu.marginalia.nodecfg.model.NodeConfiguration;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.Comparator;
|
||||
import java.util.Optional;
|
||||
|
||||
public class ExportAllPrecessionActor extends RecordActorPrototype {
|
||||
|
||||
private final NodeConfigurationService nodeConfigurationService;
|
||||
private final ExecutorExportClient exportClient;
|
||||
private final FileStorageService fileStorageService;
|
||||
private final MqPersistence persistence;
|
||||
|
||||
@Inject
|
||||
public ExportAllPrecessionActor(Gson gson,
|
||||
NodeConfigurationService nodeConfigurationService,
|
||||
ExecutorExportClient exportClient,
|
||||
FileStorageService fileStorageService,
|
||||
MqPersistence persistence)
|
||||
{
|
||||
super(gson);
|
||||
this.nodeConfigurationService = nodeConfigurationService;
|
||||
this.exportClient = exportClient;
|
||||
this.fileStorageService = fileStorageService;
|
||||
this.persistence = persistence;
|
||||
}
|
||||
|
||||
public enum ExportTask {
|
||||
FEEDS,
|
||||
ATAGS,
|
||||
TFREQ
|
||||
}
|
||||
|
||||
public record Initial(ExportTask task) implements ActorStep {}
|
||||
public record Export(int nodeId, ExportTask task, long msgId) implements ActorStep {
|
||||
public Export(int nodeId, ExportTask task) {
|
||||
this(nodeId, task, -1);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch (self) {
|
||||
case Initial(ExportTask task) -> {
|
||||
var firstNode = nextNodeId(-1);
|
||||
if (firstNode.isEmpty())
|
||||
yield new Error("No nodes included in precession");
|
||||
else
|
||||
yield new Export(firstNode.get(), task);
|
||||
}
|
||||
|
||||
case Export(int nodeId, ExportTask task, long msgId) when msgId < 0 -> {
|
||||
var activeStorages = fileStorageService.getActiveFileStorages(nodeId, FileStorageType.CRAWL_DATA);
|
||||
if (activeStorages.isEmpty()) {
|
||||
yield new Error("Node " + nodeId + " has no active file storage");
|
||||
}
|
||||
var activeCrawlStorageId = activeStorages.getFirst();
|
||||
|
||||
long trackingMsgId = switch(task) {
|
||||
case ATAGS -> exportClient.exportAtags(nodeId, activeCrawlStorageId);
|
||||
case TFREQ -> exportClient.exportTermFrequencies(nodeId, activeCrawlStorageId);
|
||||
case FEEDS -> exportClient.exportRssFeeds(nodeId, activeCrawlStorageId);
|
||||
};
|
||||
|
||||
yield new Export(nodeId, task, trackingMsgId);
|
||||
}
|
||||
|
||||
case Export(int nodeId, ExportTask task, long msgId) -> {
|
||||
for (; ; ) {
|
||||
var msg = persistence.getMessage(msgId);
|
||||
if (!msg.state().isTerminal()) {
|
||||
Thread.sleep(Duration.ofSeconds(30));
|
||||
continue;
|
||||
}
|
||||
if (msg.state() == MqMessageState.OK) {
|
||||
var nextNode = nextNodeId(nodeId);
|
||||
if (nextNode.isEmpty()) {
|
||||
yield new End();
|
||||
} else {
|
||||
yield new Export(nextNode.get(), task);
|
||||
}
|
||||
} else {
|
||||
yield new Error("Export failed for node " + nodeId);
|
||||
}
|
||||
}
|
||||
}
|
||||
default -> new Error("Unknown state");
|
||||
};
|
||||
}
|
||||
|
||||
private Optional<Integer> nextNodeId(int currentNodeId) {
|
||||
return nodeConfigurationService.getAll()
|
||||
.stream().sorted(Comparator.comparing(NodeConfiguration::node))
|
||||
.filter(node -> node.node() > currentNodeId)
|
||||
.filter(NodeConfiguration::includeInPrecession)
|
||||
.map(NodeConfiguration::node)
|
||||
.findFirst();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String describe() {
|
||||
return "Runs an export job on each index node included in the precession";
|
||||
}
|
||||
}
|
@@ -0,0 +1,29 @@
|
||||
package nu.marginalia.actor.proc;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@Singleton
|
||||
public class ExportTaskMonitorActor extends AbstractProcessSpawnerActor {
|
||||
|
||||
@Inject
|
||||
public ExportTaskMonitorActor(Gson gson,
|
||||
ServiceConfiguration configuration,
|
||||
MqPersistence persistence,
|
||||
ProcessService processService) {
|
||||
super(gson,
|
||||
configuration,
|
||||
persistence,
|
||||
processService,
|
||||
ProcessInboxNames.EXPORT_TASK_INBOX,
|
||||
ProcessService.ProcessId.EXPORT_TASKS);
|
||||
}
|
||||
|
||||
|
||||
}
|
@@ -0,0 +1,29 @@
|
||||
package nu.marginalia.actor.proc;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.monitor.AbstractProcessSpawnerActor;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.ProcessInboxNames;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
@Singleton
|
||||
public class LiveCrawlerMonitorActor extends AbstractProcessSpawnerActor {
|
||||
|
||||
@Inject
|
||||
public LiveCrawlerMonitorActor(Gson gson,
|
||||
ServiceConfiguration configuration,
|
||||
MqPersistence persistence,
|
||||
ProcessService processService) {
|
||||
super(gson,
|
||||
configuration,
|
||||
persistence,
|
||||
processService,
|
||||
ProcessInboxNames.LIVE_CRAWLER_INBOX,
|
||||
ProcessService.ProcessId.LIVE_CRAWLER);
|
||||
}
|
||||
|
||||
|
||||
}
|
@@ -10,6 +10,8 @@ import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.actor.state.Resume;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.nodecfg.NodeConfigurationService;
|
||||
import nu.marginalia.nodecfg.model.NodeProfile;
|
||||
import nu.marginalia.service.control.ServiceEventLog;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.jsoup.Jsoup;
|
||||
@@ -39,6 +41,7 @@ public class ScrapeFeedsActor extends RecordActorPrototype {
|
||||
private final Duration pollInterval = Duration.ofHours(6);
|
||||
|
||||
private final ServiceEventLog eventLog;
|
||||
private final NodeConfigurationService nodeConfigurationService;
|
||||
private final HikariDataSource dataSource;
|
||||
private final int nodeId;
|
||||
|
||||
@@ -54,8 +57,8 @@ public class ScrapeFeedsActor extends RecordActorPrototype {
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch(self) {
|
||||
case Initial() -> {
|
||||
if (nodeId > 1) {
|
||||
yield new End();
|
||||
if (nodeConfigurationService.get(nodeId).profile() != NodeProfile.REALTIME) {
|
||||
yield new Error("Invalid node profile for RSS update");
|
||||
}
|
||||
else {
|
||||
yield new Wait(LocalDateTime.now().toString());
|
||||
@@ -177,10 +180,12 @@ public class ScrapeFeedsActor extends RecordActorPrototype {
|
||||
public ScrapeFeedsActor(Gson gson,
|
||||
ServiceEventLog eventLog,
|
||||
ServiceConfiguration configuration,
|
||||
NodeConfigurationService nodeConfigurationService,
|
||||
HikariDataSource dataSource)
|
||||
{
|
||||
super(gson);
|
||||
this.eventLog = eventLog;
|
||||
this.nodeConfigurationService = nodeConfigurationService;
|
||||
this.dataSource = dataSource;
|
||||
this.nodeId = configuration.node();
|
||||
}
|
||||
|
141
code/execution/java/nu/marginalia/actor/proc/UpdateRssActor.java
Normal file
141
code/execution/java/nu/marginalia/actor/proc/UpdateRssActor.java
Normal file
@@ -0,0 +1,141 @@
|
||||
package nu.marginalia.actor.proc;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.actor.state.Resume;
|
||||
import nu.marginalia.api.feeds.FeedsClient;
|
||||
import nu.marginalia.api.feeds.RpcFeedUpdateMode;
|
||||
import nu.marginalia.mq.MqMessage;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.nodecfg.NodeConfigurationService;
|
||||
import nu.marginalia.nodecfg.model.NodeProfile;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.time.LocalDateTime;
|
||||
|
||||
public class UpdateRssActor extends RecordActorPrototype {
|
||||
|
||||
private final FeedsClient feedsClient;
|
||||
private final int nodeId;
|
||||
|
||||
private final Duration initialDelay = Duration.ofMinutes(5);
|
||||
private final Duration updateInterval = Duration.ofHours(24);
|
||||
private final int cleanInterval = 60;
|
||||
|
||||
private final NodeConfigurationService nodeConfigurationService;
|
||||
private final MqPersistence persistence;
|
||||
|
||||
@Inject
|
||||
public UpdateRssActor(Gson gson,
|
||||
FeedsClient feedsClient,
|
||||
ServiceConfiguration serviceConfiguration,
|
||||
NodeConfigurationService nodeConfigurationService,
|
||||
MqPersistence persistence) {
|
||||
super(gson);
|
||||
this.feedsClient = feedsClient;
|
||||
this.nodeId = serviceConfiguration.node();
|
||||
this.nodeConfigurationService = nodeConfigurationService;
|
||||
this.persistence = persistence;
|
||||
}
|
||||
|
||||
public record Initial() implements ActorStep {}
|
||||
@Resume(behavior = ActorResumeBehavior.RETRY)
|
||||
public record Wait(String ts, int refreshCount) implements ActorStep {}
|
||||
@Resume(behavior = ActorResumeBehavior.RETRY)
|
||||
public record UpdateRefresh(int refreshCount, long msgId) implements ActorStep {
|
||||
public UpdateRefresh(int refreshCount) {
|
||||
this(refreshCount, -1);
|
||||
}
|
||||
}
|
||||
@Resume(behavior = ActorResumeBehavior.RETRY)
|
||||
public record UpdateClean(long msgId) implements ActorStep {
|
||||
public UpdateClean() {
|
||||
this(-1);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch (self) {
|
||||
case Initial() -> {
|
||||
if (nodeConfigurationService.get(nodeId).profile() != NodeProfile.REALTIME) {
|
||||
yield new Error("Invalid node profile for RSS update");
|
||||
}
|
||||
else {
|
||||
// Wait for 5 minutes before starting the first update, to give the system time to start up properly
|
||||
yield new Wait(LocalDateTime.now().plus(initialDelay).toString(), 0);
|
||||
}
|
||||
}
|
||||
case Wait(String untilTs, int count) -> {
|
||||
var until = LocalDateTime.parse(untilTs);
|
||||
var now = LocalDateTime.now();
|
||||
|
||||
long remaining = Duration.between(now, until).toMillis();
|
||||
|
||||
if (remaining > 0) {
|
||||
Thread.sleep(remaining);
|
||||
yield new Wait(untilTs, count);
|
||||
}
|
||||
else {
|
||||
|
||||
// Once every `cleanInterval` updates, do a clean update;
|
||||
// otherwise do a refresh update
|
||||
if (count > cleanInterval) {
|
||||
yield new UpdateClean();
|
||||
}
|
||||
else {
|
||||
yield new UpdateRefresh(count);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
case UpdateRefresh(int count, long msgId) when msgId < 0 -> {
|
||||
long messageId = feedsClient.updateFeeds(RpcFeedUpdateMode.REFRESH);
|
||||
yield new UpdateRefresh(count, messageId);
|
||||
}
|
||||
case UpdateRefresh(int count, long msgId) -> {
|
||||
MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
|
||||
if (msg == null) {
|
||||
// Retry the update
|
||||
yield new Error("Failed to update feeds: message not found");
|
||||
} else if (msg.state() != MqMessageState.OK) {
|
||||
// Retry the update
|
||||
yield new Error("Failed to update feeds: " + msg.state());
|
||||
}
|
||||
else {
|
||||
// Increment the refresh count
|
||||
yield new Wait(LocalDateTime.now().plus(updateInterval).toString(), count + 1);
|
||||
}
|
||||
}
|
||||
case UpdateClean(long msgId) when msgId < 0 -> {
|
||||
long messageId = feedsClient.updateFeeds(RpcFeedUpdateMode.CLEAN);
|
||||
yield new UpdateClean(messageId);
|
||||
}
|
||||
case UpdateClean(long msgId) -> {
|
||||
MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
|
||||
if (msg == null) {
|
||||
// Retry the update
|
||||
yield new Error("Failed to update feeds: message not found");
|
||||
} else if (msg.state() != MqMessageState.OK) {
|
||||
// Retry the update
|
||||
yield new Error("Failed to update feeds: " + msg.state());
|
||||
}
|
||||
else {
|
||||
// Reset the refresh count after a successful update
|
||||
yield new Wait(LocalDateTime.now().plus(updateInterval).toString(), 0);
|
||||
}
|
||||
}
|
||||
default -> new Error("Unknown actor step: " + self);
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public String describe() {
|
||||
return "Periodically updates RSS and Atom feeds";
|
||||
}
|
||||
}
|
@@ -8,6 +8,9 @@ import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.actor.state.Resume;
|
||||
import nu.marginalia.encyclopedia.EncyclopediaConverter;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mqapi.converting.ConvertRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.sideload.RedditSideloadHelper;
|
||||
@@ -17,9 +20,6 @@ import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageState;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mqapi.converting.ConvertRequest;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@@ -140,7 +140,7 @@ public class ConvertActor extends RecordActorPrototype {
|
||||
// To avoid re-converting the same file, we'll assign the file a name based on its hash
|
||||
// and the original filename. This way, if we're fed the same file again, we'll be able to just
|
||||
// re-use the predigested database file.
|
||||
yield new PredigestEncyclopedia(source, STR."\{source}.\{hash}.db", baseUrl);
|
||||
yield new PredigestEncyclopedia(source, source + "." + hash + ".db", baseUrl);
|
||||
} else if (!source.endsWith(".db")) {
|
||||
yield new Error("Source path must be a ZIM or pre-digested sqlite database file (.db)");
|
||||
}
|
||||
|
@@ -3,9 +3,6 @@ package nu.marginalia.actor.task;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.With;
|
||||
import nu.marginalia.IndexLocations;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||
@@ -40,7 +37,6 @@ import java.util.List;
|
||||
public class ConvertAndLoadActor extends RecordActorPrototype {
|
||||
|
||||
// STATES
|
||||
|
||||
public static final String RERANK = "RERANK";
|
||||
private final ActorProcessWatcher processWatcher;
|
||||
private final MqOutbox mqConverterOutbox;
|
||||
@@ -54,15 +50,6 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
||||
private final int nodeId;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
|
||||
@AllArgsConstructor @With @NoArgsConstructor
|
||||
public static class Message {
|
||||
public FileStorageId crawlStorageId = null;
|
||||
public List<FileStorageId> processedStorageId = null;
|
||||
public long converterMsgId = 0L;
|
||||
public long loaderMsgId = 0L;
|
||||
}
|
||||
|
||||
public record Initial(FileStorageId fid) implements ActorStep {}
|
||||
|
||||
@Resume(behavior = ActorResumeBehavior.RETRY)
|
||||
|
@@ -3,50 +3,78 @@ package nu.marginalia.actor.task;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.extractor.AtagExporter;
|
||||
import nu.marginalia.extractor.ExporterIf;
|
||||
import nu.marginalia.storage.model.*;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageState;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.time.LocalDateTime;
|
||||
|
||||
@Singleton
|
||||
public class ExportAtagsActor extends RecordActorPrototype {
|
||||
private final FileStorageService storageService;
|
||||
private final ExporterIf atagExporter;
|
||||
private final ActorProcessWatcher processWatcher;
|
||||
private final MqOutbox exportTasksOutbox;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final MqPersistence persistence;
|
||||
|
||||
public record Export(long responseMsgId, FileStorageId crawlId) implements ActorStep {}
|
||||
public record Run(long responseMsgId,FileStorageId crawlId, FileStorageId destId, long msgId) implements ActorStep {
|
||||
public Run(long responseMsgId, FileStorageId crawlId, FileStorageId destId) {
|
||||
this(responseMsgId, crawlId, destId, -1);
|
||||
}
|
||||
}
|
||||
public record Fail(long responseMsgId, String message) implements ActorStep {}
|
||||
|
||||
public record Export(FileStorageId crawlId) implements ActorStep {}
|
||||
public record Run(FileStorageId crawlId, FileStorageId destId) implements ActorStep {}
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch(self) {
|
||||
case Export(FileStorageId crawlId) -> {
|
||||
var storage = storageService.allocateStorage(FileStorageType.EXPORT, "atag-export", "Anchor Tags " + LocalDateTime.now());
|
||||
case Export(long responseMsgId, FileStorageId crawlId) -> {
|
||||
persistence.updateMessageState(responseMsgId, MqMessageState.ACK);
|
||||
|
||||
if (storage == null) yield new Error("Bad storage id");
|
||||
yield new Run(crawlId, storage.id());
|
||||
var storage = storageService.allocateStorage(FileStorageType.EXPORT, "atags-export", "Atags " + LocalDateTime.now());
|
||||
|
||||
if (storage == null) yield new Fail(responseMsgId, "Bad storage id");
|
||||
|
||||
yield new Run(responseMsgId, crawlId, storage.id());
|
||||
}
|
||||
case Run(FileStorageId crawlId, FileStorageId destId) -> {
|
||||
case Run(long responseMsgId, FileStorageId crawlId, FileStorageId destId, long msgId) when msgId < 0 -> {
|
||||
storageService.setFileStorageState(destId, FileStorageState.NEW);
|
||||
|
||||
try {
|
||||
atagExporter.export(crawlId, destId);
|
||||
storageService.setFileStorageState(destId, FileStorageState.UNSET);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
storageService.setFileStorageState(destId, FileStorageState.DELETE);
|
||||
yield new Error("Failed to export data");
|
||||
}
|
||||
long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.atags(crawlId, destId));
|
||||
yield new Run(responseMsgId, crawlId, destId, newMsgId);
|
||||
}
|
||||
case Run(long responseMsgId, FileStorageId crawlId, FileStorageId destId, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
||||
|
||||
yield new End();
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
storageService.flagFileForDeletion(destId);
|
||||
yield new Fail(responseMsgId, "Exporter failed");
|
||||
}
|
||||
else {
|
||||
storageService.setFileStorageState(destId, FileStorageState.UNSET);
|
||||
persistence.updateMessageState(responseMsgId, MqMessageState.OK);
|
||||
yield new End();
|
||||
}
|
||||
}
|
||||
case Fail(long responseMsgId, String message) -> {
|
||||
persistence.updateMessageState(responseMsgId, MqMessageState.ERR);
|
||||
yield new Error(message);
|
||||
}
|
||||
default -> new Error();
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String describe() {
|
||||
return "Export anchor tags from crawl data";
|
||||
@@ -55,11 +83,15 @@ public class ExportAtagsActor extends RecordActorPrototype {
|
||||
@Inject
|
||||
public ExportAtagsActor(Gson gson,
|
||||
FileStorageService storageService,
|
||||
AtagExporter atagExporter)
|
||||
ProcessOutboxes processOutboxes,
|
||||
MqPersistence persistence,
|
||||
ActorProcessWatcher processWatcher)
|
||||
{
|
||||
super(gson);
|
||||
this.exportTasksOutbox = processOutboxes.getExportTasksOutbox();
|
||||
this.storageService = storageService;
|
||||
this.atagExporter = atagExporter;
|
||||
this.persistence = persistence;
|
||||
this.processWatcher = processWatcher;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -5,8 +5,12 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.extractor.ExporterIf;
|
||||
import nu.marginalia.extractor.FeedExporter;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageState;
|
||||
@@ -19,33 +23,52 @@ import java.time.LocalDateTime;
|
||||
@Singleton
|
||||
public class ExportFeedsActor extends RecordActorPrototype {
|
||||
private final FileStorageService storageService;
|
||||
private final ActorProcessWatcher processWatcher;
|
||||
private final MqOutbox exportTasksOutbox;
|
||||
private final MqPersistence persistence;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private final ExporterIf feedExporter;
|
||||
public record Export(FileStorageId crawlId) implements ActorStep {}
|
||||
public record Run(FileStorageId crawlId, FileStorageId destId) implements ActorStep {}
|
||||
public record Export(long responseMsgId, FileStorageId crawlId) implements ActorStep {}
|
||||
public record Run(long responseMsgId, FileStorageId crawlId, FileStorageId destId, long msgId) implements ActorStep {
|
||||
public Run(long responseMsgId, FileStorageId crawlId, FileStorageId destId) {
|
||||
this(responseMsgId, crawlId, destId, -1);
|
||||
}
|
||||
}
|
||||
public record Fail(long responseMsgId, String message) implements ActorStep {}
|
||||
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch(self) {
|
||||
case Export(FileStorageId crawlId) -> {
|
||||
case Export(long responseMsgId, FileStorageId crawlId) -> {
|
||||
persistence.updateMessageState(responseMsgId, MqMessageState.ACK);
|
||||
|
||||
var storage = storageService.allocateStorage(FileStorageType.EXPORT, "feed-export", "Feeds " + LocalDateTime.now());
|
||||
|
||||
if (storage == null) yield new Error("Bad storage id");
|
||||
yield new Run(crawlId, storage.id());
|
||||
if (storage == null) yield new Fail(responseMsgId, "Bad storage id");
|
||||
yield new Run(responseMsgId, crawlId, storage.id());
|
||||
}
|
||||
case Run(FileStorageId crawlId, FileStorageId destId) -> {
|
||||
case Run(long responseMsgId, FileStorageId crawlId, FileStorageId destId, long msgId) when msgId < 0 -> {
|
||||
storageService.setFileStorageState(destId, FileStorageState.NEW);
|
||||
|
||||
try {
|
||||
feedExporter.export(crawlId, destId);
|
||||
storageService.setFileStorageState(destId, FileStorageState.UNSET);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
storageService.setFileStorageState(destId, FileStorageState.DELETE);
|
||||
yield new Error("Failed to export data");
|
||||
}
|
||||
long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.feeds(crawlId, destId));
|
||||
yield new Run(responseMsgId, crawlId, destId, newMsgId);
|
||||
}
|
||||
case Run(long responseMsgId, _, FileStorageId destId, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
||||
|
||||
yield new End();
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
storageService.flagFileForDeletion(destId);
|
||||
yield new Fail(responseMsgId, "Exporter failed");
|
||||
}
|
||||
else {
|
||||
storageService.setFileStorageState(destId, FileStorageState.UNSET);
|
||||
persistence.updateMessageState(responseMsgId, MqMessageState.OK);
|
||||
yield new End();
|
||||
}
|
||||
}
|
||||
case Fail(long responseMsgId, String message) -> {
|
||||
persistence.updateMessageState(responseMsgId, MqMessageState.ERR);
|
||||
yield new Error(message);
|
||||
}
|
||||
default -> new Error();
|
||||
};
|
||||
@@ -60,11 +83,14 @@ public class ExportFeedsActor extends RecordActorPrototype {
|
||||
@Inject
|
||||
public ExportFeedsActor(Gson gson,
|
||||
FileStorageService storageService,
|
||||
FeedExporter feedExporter)
|
||||
ActorProcessWatcher processWatcher,
|
||||
ProcessOutboxes outboxes, MqPersistence persistence)
|
||||
{
|
||||
super(gson);
|
||||
this.storageService = storageService;
|
||||
this.feedExporter = feedExporter;
|
||||
this.processWatcher = processWatcher;
|
||||
this.exportTasksOutbox = outboxes.getExportTasksOutbox();
|
||||
this.persistence = persistence;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -5,7 +5,11 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.extractor.SampleDataExporter;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageState;
|
||||
@@ -18,45 +22,52 @@ import java.time.LocalDateTime;
|
||||
@Singleton
|
||||
public class ExportSampleDataActor extends RecordActorPrototype {
|
||||
private final FileStorageService storageService;
|
||||
private final ActorProcessWatcher processWatcher;
|
||||
private final MqOutbox exportTasksOutbox;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private final SampleDataExporter dataExporter;
|
||||
public record Export(FileStorageId crawlId, int size, String name) implements ActorStep {}
|
||||
public record Run(FileStorageId crawlId, FileStorageId destId, int size, String name) implements ActorStep {}
|
||||
public record Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) implements ActorStep {
|
||||
public Run(FileStorageId crawlId, FileStorageId destId, int size, String name) {
|
||||
this(crawlId, destId, size, name, -1);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch(self) {
|
||||
case Export(FileStorageId crawlId, int size, String name) -> {
|
||||
var storage = storageService.allocateStorage(FileStorageType.EXPORT,
|
||||
"crawl-sample-export",
|
||||
STR."Crawl Data Sample \{name}/\{size} \{LocalDateTime.now()}"
|
||||
"Crawl Data Sample " + name + "/" + size + " " + LocalDateTime.now()
|
||||
);
|
||||
|
||||
if (storage == null) yield new Error("Bad storage id");
|
||||
yield new Run(crawlId, storage.id(), size, name);
|
||||
}
|
||||
case Run(FileStorageId crawlId, FileStorageId destId, int size, String name) -> {
|
||||
case Run(FileStorageId crawlId, FileStorageId destId, int size, String name, long msgId) when msgId < 0 -> {
|
||||
storageService.setFileStorageState(destId, FileStorageState.NEW);
|
||||
|
||||
try {
|
||||
dataExporter.export(crawlId, destId, size, name);
|
||||
storageService.setFileStorageState(destId, FileStorageState.UNSET);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
storageService.setFileStorageState(destId, FileStorageState.DELETE);
|
||||
|
||||
logger.error("Failed to export data", ex);
|
||||
|
||||
yield new Error("Failed to export data");
|
||||
}
|
||||
|
||||
yield new End();
|
||||
long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.sampleData(crawlId, destId, size, name));
|
||||
yield new Run(crawlId, destId, size, name, newMsgId);
|
||||
}
|
||||
case Run(_, FileStorageId destId, _, _, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
storageService.flagFileForDeletion(destId);
|
||||
yield new Error("Exporter failed");
|
||||
}
|
||||
else {
|
||||
storageService.setFileStorageState(destId, FileStorageState.UNSET);
|
||||
yield new End();
|
||||
}
|
||||
}
|
||||
|
||||
default -> new Error();
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String describe() {
|
||||
return "Export RSS/Atom feeds from crawl data";
|
||||
@@ -65,11 +76,13 @@ public class ExportSampleDataActor extends RecordActorPrototype {
|
||||
@Inject
|
||||
public ExportSampleDataActor(Gson gson,
|
||||
FileStorageService storageService,
|
||||
SampleDataExporter dataExporter)
|
||||
ProcessOutboxes processOutboxes,
|
||||
ActorProcessWatcher processWatcher)
|
||||
{
|
||||
super(gson);
|
||||
this.storageService = storageService;
|
||||
this.dataExporter = dataExporter;
|
||||
this.processWatcher = processWatcher;
|
||||
this.exportTasksOutbox = processOutboxes.getExportTasksOutbox();
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -5,45 +5,70 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.extractor.ExporterIf;
|
||||
import nu.marginalia.extractor.TermFrequencyExporter;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageState;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.time.LocalDateTime;
|
||||
|
||||
@Singleton
|
||||
public class ExportTermFreqActor extends RecordActorPrototype {
|
||||
private final FileStorageService storageService;
|
||||
private final ExporterIf exporter;
|
||||
public record Export(FileStorageId crawlId) implements ActorStep {}
|
||||
public record Run(FileStorageId crawlId, FileStorageId destId) implements ActorStep {}
|
||||
private final ActorProcessWatcher processWatcher;
|
||||
private final MqOutbox exportTasksOutbox;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final MqPersistence persistence;
|
||||
|
||||
public record Export(long responseMsgId, FileStorageId crawlId) implements ActorStep {}
|
||||
public record Run(long responseMsgId, FileStorageId crawlId, FileStorageId destId, long msgId) implements ActorStep {
|
||||
public Run(long responseMsgId, FileStorageId crawlId, FileStorageId destId) {
|
||||
this(responseMsgId, crawlId, destId, -1);
|
||||
}
|
||||
}
|
||||
public record Fail(long responseMsgId, String message) implements ActorStep {}
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch(self) {
|
||||
case Export(FileStorageId crawlId) -> {
|
||||
var storage = storageService.allocateStorage(FileStorageType.EXPORT, "term-freq-export", "Term Frequencies " + LocalDateTime.now());
|
||||
case Export(long responseMsgId, FileStorageId crawlId) -> {
|
||||
persistence.updateMessageState(responseMsgId, MqMessageState.ACK);
|
||||
var storage = storageService.allocateStorage(FileStorageType.EXPORT, "term-freq", "Term Frequencies " + LocalDateTime.now());
|
||||
|
||||
if (storage == null) yield new Error("Bad storage id");
|
||||
yield new Run(crawlId, storage.id());
|
||||
if (storage == null) yield new Fail(responseMsgId, "Bad storage id");
|
||||
yield new Run(responseMsgId, crawlId, storage.id());
|
||||
}
|
||||
case Run(FileStorageId crawlId, FileStorageId destId) -> {
|
||||
case Run(long responseMsgId, FileStorageId crawlId, FileStorageId destId, long msgId) when msgId < 0 -> {
|
||||
storageService.setFileStorageState(destId, FileStorageState.NEW);
|
||||
|
||||
try {
|
||||
exporter.export(crawlId, destId);
|
||||
storageService.setFileStorageState(destId, FileStorageState.UNSET);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
storageService.setFileStorageState(destId, FileStorageState.DELETE);
|
||||
yield new Error("Failed to export data");
|
||||
}
|
||||
|
||||
yield new End();
|
||||
long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.termFreq(crawlId, destId));
|
||||
yield new Run(responseMsgId, crawlId, destId, newMsgId);
|
||||
}
|
||||
case Run(long responseMsgId, _, FileStorageId destId, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
storageService.flagFileForDeletion(destId);
|
||||
yield new Fail(responseMsgId, "Exporter failed");
|
||||
}
|
||||
else {
|
||||
storageService.setFileStorageState(destId, FileStorageState.UNSET);
|
||||
persistence.updateMessageState(responseMsgId, MqMessageState.OK);
|
||||
yield new End();
|
||||
}
|
||||
}
|
||||
case Fail(long responseMsgId, String message) -> {
|
||||
persistence.updateMessageState(responseMsgId, MqMessageState.ERR);
|
||||
yield new Error(message);
|
||||
}
|
||||
|
||||
default -> new Error();
|
||||
};
|
||||
}
|
||||
@@ -57,11 +82,15 @@ public class ExportTermFreqActor extends RecordActorPrototype {
|
||||
@Inject
|
||||
public ExportTermFreqActor(Gson gson,
|
||||
FileStorageService storageService,
|
||||
TermFrequencyExporter exporter)
|
||||
ProcessOutboxes processOutboxes,
|
||||
MqPersistence persistence,
|
||||
ActorProcessWatcher processWatcher)
|
||||
{
|
||||
super(gson);
|
||||
this.storageService = storageService;
|
||||
this.exporter = exporter;
|
||||
this.persistence = persistence;
|
||||
this.processWatcher = processWatcher;
|
||||
this.exportTasksOutbox = processOutboxes.getExportTasksOutbox();
|
||||
}
|
||||
|
||||
}
|
||||
|
114
code/execution/java/nu/marginalia/actor/task/LiveCrawlActor.java
Normal file
114
code/execution/java/nu/marginalia/actor/task/LiveCrawlActor.java
Normal file
@@ -0,0 +1,114 @@
|
||||
package nu.marginalia.actor.task;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.IndexLocations;
|
||||
import nu.marginalia.actor.ExecutorActor;
|
||||
import nu.marginalia.actor.ExecutorActorStateMachines;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.api.feeds.FeedsClient;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mqapi.crawling.LiveCrawlRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.time.Duration;
|
||||
import java.util.Objects;
|
||||
|
||||
@Singleton
|
||||
public class LiveCrawlActor extends RecordActorPrototype {
|
||||
|
||||
// STATES
|
||||
private final ActorProcessWatcher processWatcher;
|
||||
private final MqOutbox mqLiveCrawlerOutbox;
|
||||
private final ExecutorActorStateMachines executorActorStateMachines;
|
||||
private final FeedsClient feedsClient;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private final FileStorageService fileStorageService;
|
||||
|
||||
public record Initial() implements ActorStep {}
|
||||
public record Monitor(String feedsHash) implements ActorStep {}
|
||||
public record LiveCrawl(String feedsHash, long msgId) implements ActorStep {
|
||||
public LiveCrawl(String feedsHash) { this(feedsHash, -1); }
|
||||
}
|
||||
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
logger.info("{}", self);
|
||||
return switch (self) {
|
||||
case Initial() -> {
|
||||
yield new Monitor("-");
|
||||
}
|
||||
case Monitor(String feedsHash) -> {
|
||||
// Sleep initially in case this is during start-up
|
||||
for (;;) {
|
||||
try {
|
||||
Thread.sleep(Duration.ofMinutes(15));
|
||||
String currentHash = feedsClient.getFeedDataHash();
|
||||
if (!Objects.equals(currentHash, feedsHash)) {
|
||||
yield new LiveCrawl(currentHash);
|
||||
}
|
||||
}
|
||||
catch (RuntimeException ex) {
|
||||
logger.error("Failed to fetch feed data hash");
|
||||
}
|
||||
}
|
||||
}
|
||||
case LiveCrawl(String feedsHash, long msgId) when msgId < 0 -> {
|
||||
// Clear the index journal before starting the crawl
|
||||
Path indexJournalLocation = IndexLocations.getIndexConstructionArea(fileStorageService).resolve("index-journal");
|
||||
if (Files.isDirectory(indexJournalLocation)) {
|
||||
FileUtils.deleteDirectory(indexJournalLocation.toFile());
|
||||
}
|
||||
|
||||
long id = mqLiveCrawlerOutbox.sendAsync(new LiveCrawlRequest());
|
||||
yield new LiveCrawl(feedsHash, id);
|
||||
}
|
||||
case LiveCrawl(String feedsHash, long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(mqLiveCrawlerOutbox, ProcessService.ProcessId.LIVE_CRAWLER, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
yield new Error("Crawler failed");
|
||||
}
|
||||
|
||||
// Build the index
|
||||
executorActorStateMachines.initFrom(ExecutorActor.CONVERT_AND_LOAD, new ConvertAndLoadActor.Rerank());
|
||||
|
||||
yield new Monitor(feedsHash);
|
||||
}
|
||||
default -> new Error("Unknown state");
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public String describe() {
|
||||
return "Actor that polls the feeds database for changes, and triggers the live crawler when needed";
|
||||
}
|
||||
|
||||
@Inject
|
||||
public LiveCrawlActor(ActorProcessWatcher processWatcher,
|
||||
ProcessOutboxes processOutboxes,
|
||||
FeedsClient feedsClient,
|
||||
Gson gson,
|
||||
ExecutorActorStateMachines executorActorStateMachines, FileStorageService fileStorageService)
|
||||
{
|
||||
super(gson);
|
||||
this.processWatcher = processWatcher;
|
||||
this.mqLiveCrawlerOutbox = processOutboxes.getLiveCrawlerOutbox();
|
||||
this.executorActorStateMachines = executorActorStateMachines;
|
||||
this.feedsClient = feedsClient;
|
||||
this.fileStorageService = fileStorageService;
|
||||
}
|
||||
|
||||
|
||||
}
|
@@ -5,53 +5,59 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mqapi.tasks.ExportTaskRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
@Singleton
|
||||
public class TriggerAdjacencyCalculationActor extends RecordActorPrototype {
|
||||
|
||||
private final ActorProcessWatcher processWatcher;
|
||||
private final MqOutbox exportTasksOutbox;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final ProcessService processService;
|
||||
private final ExecutorService executor = Executors.newSingleThreadExecutor();
|
||||
|
||||
public record Run() implements ActorStep {}
|
||||
public record Run(long msgId) implements ActorStep {
|
||||
public Run() {
|
||||
this(-1);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public ActorStep transition(ActorStep self) throws Exception {
|
||||
return switch (self) {
|
||||
case Run() -> {
|
||||
AtomicBoolean hasError = new AtomicBoolean(false);
|
||||
var future = executor.submit(() -> {
|
||||
try {
|
||||
processService.trigger(ProcessService.ProcessId.ADJACENCIES_CALCULATOR, "load");
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.warn("Error triggering adjacency calculation", ex);
|
||||
hasError.set(true);
|
||||
}
|
||||
});
|
||||
future.get();
|
||||
|
||||
if (hasError.get()) {
|
||||
yield new Error("Error triggering adjacency calculation");
|
||||
}
|
||||
yield new End();
|
||||
return switch(self) {
|
||||
case Run(long msgId) when msgId < 0 -> {
|
||||
long newMsgId = exportTasksOutbox.sendAsync(ExportTaskRequest.adjacencies());
|
||||
yield new Run(newMsgId);
|
||||
}
|
||||
case Run(long msgId) -> {
|
||||
var rsp = processWatcher.waitResponse(exportTasksOutbox, ProcessService.ProcessId.EXPORT_TASKS, msgId);
|
||||
|
||||
if (rsp.state() != MqMessageState.OK) {
|
||||
yield new Error("Exporter failed");
|
||||
}
|
||||
else {
|
||||
yield new End();
|
||||
}
|
||||
}
|
||||
|
||||
default -> new Error();
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@Inject
|
||||
public TriggerAdjacencyCalculationActor(Gson gson,
|
||||
ProcessService processService) {
|
||||
ProcessOutboxes processOutboxes,
|
||||
ActorProcessWatcher processWatcher) {
|
||||
super(gson);
|
||||
this.processService = processService;
|
||||
|
||||
this.exportTasksOutbox = processOutboxes.getExportTasksOutbox();
|
||||
this.processWatcher = processWatcher;
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@@ -5,8 +5,10 @@ import com.google.inject.Singleton;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import nu.marginalia.actor.ExecutorActor;
|
||||
import nu.marginalia.actor.ExecutorActorControlService;
|
||||
import nu.marginalia.actor.precession.ExportAllPrecessionActor;
|
||||
import nu.marginalia.actor.task.*;
|
||||
import nu.marginalia.functions.execution.api.*;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.service.server.DiscoverableService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
|
||||
@@ -16,17 +18,21 @@ public class ExecutorExportGrpcService
|
||||
implements DiscoverableService
|
||||
{
|
||||
private final ExecutorActorControlService actorControlService;
|
||||
private final ServiceConfiguration serviceConfiguration;
|
||||
|
||||
@Inject
|
||||
public ExecutorExportGrpcService(ExecutorActorControlService actorControlService) {
|
||||
public ExecutorExportGrpcService(ExecutorActorControlService actorControlService, ServiceConfiguration serviceConfiguration) {
|
||||
this.actorControlService = actorControlService;
|
||||
this.serviceConfiguration = serviceConfiguration;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void exportAtags(RpcFileStorageId request, StreamObserver<Empty> responseObserver) {
|
||||
public void exportAtags(RpcExportRequest request, StreamObserver<Empty> responseObserver) {
|
||||
try {
|
||||
actorControlService.startFrom(ExecutorActor.EXPORT_ATAGS,
|
||||
new ExportAtagsActor.Export(FileStorageId.of(request.getFileStorageId()))
|
||||
new ExportAtagsActor.Export(
|
||||
request.getMsgId(),
|
||||
FileStorageId.of(request.getFileStorageId()))
|
||||
);
|
||||
responseObserver.onNext(Empty.getDefaultInstance());
|
||||
responseObserver.onCompleted();
|
||||
@@ -55,10 +61,12 @@ public class ExecutorExportGrpcService
|
||||
}
|
||||
|
||||
@Override
|
||||
public void exportRssFeeds(RpcFileStorageId request, StreamObserver<Empty> responseObserver) {
|
||||
public void exportRssFeeds(RpcExportRequest request, StreamObserver<Empty> responseObserver) {
|
||||
try {
|
||||
actorControlService.startFrom(ExecutorActor.EXPORT_FEEDS,
|
||||
new ExportFeedsActor.Export(FileStorageId.of(request.getFileStorageId()))
|
||||
new ExportFeedsActor.Export(
|
||||
request.getMsgId(),
|
||||
FileStorageId.of(request.getFileStorageId()))
|
||||
);
|
||||
responseObserver.onNext(Empty.getDefaultInstance());
|
||||
responseObserver.onCompleted();
|
||||
@@ -69,10 +77,10 @@ public class ExecutorExportGrpcService
|
||||
}
|
||||
|
||||
@Override
|
||||
public void exportTermFrequencies(RpcFileStorageId request, StreamObserver<Empty> responseObserver) {
|
||||
public void exportTermFrequencies(RpcExportRequest request, StreamObserver<Empty> responseObserver) {
|
||||
try {
|
||||
actorControlService.startFrom(ExecutorActor.EXPORT_TERM_FREQUENCIES,
|
||||
new ExportTermFreqActor.Export(FileStorageId.of(request.getFileStorageId()))
|
||||
new ExportTermFreqActor.Export(request.getMsgId(), FileStorageId.of(request.getFileStorageId()))
|
||||
);
|
||||
responseObserver.onNext(Empty.getDefaultInstance());
|
||||
responseObserver.onCompleted();
|
||||
@@ -109,4 +117,48 @@ public class ExecutorExportGrpcService
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void exportAllAtags(Empty request, StreamObserver<Empty> responseObserver) {
|
||||
if (serviceConfiguration.node() != 1) {
|
||||
responseObserver.onError(new IllegalArgumentException("Export all atags is only available on node 1"));
|
||||
}
|
||||
try {
|
||||
actorControlService.startFrom(ExecutorActor.PREC_EXPORT_ALL,
|
||||
new ExportAllPrecessionActor.Initial(ExportAllPrecessionActor.ExportTask.ATAGS)
|
||||
);
|
||||
responseObserver.onNext(Empty.getDefaultInstance());
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void exportAllFeeds(Empty request, StreamObserver<Empty> responseObserver) {
|
||||
try {
|
||||
actorControlService.startFrom(ExecutorActor.PREC_EXPORT_ALL,
|
||||
new ExportAllPrecessionActor.Initial(ExportAllPrecessionActor.ExportTask.FEEDS)
|
||||
);
|
||||
responseObserver.onNext(Empty.getDefaultInstance());
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void exportAllTfreqs(Empty request, StreamObserver<Empty> responseObserver) {
|
||||
try {
|
||||
actorControlService.startFrom(ExecutorActor.PREC_EXPORT_ALL,
|
||||
new ExportAllPrecessionActor.Initial(ExportAllPrecessionActor.ExportTask.TFREQ)
|
||||
);
|
||||
responseObserver.onNext(Empty.getDefaultInstance());
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
catch (Exception e) {
|
||||
responseObserver.onError(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -2,7 +2,6 @@ package nu.marginalia.execution;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.actor.ActorApi;
|
||||
import nu.marginalia.actor.ExecutorActor;
|
||||
@@ -16,9 +15,12 @@ import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.service.server.DiscoverableService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.time.Duration;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.ZoneId;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
@@ -33,6 +35,8 @@ public class ExecutorGrpcService
|
||||
private final ServiceConfiguration serviceConfiguration;
|
||||
private final ExecutorActorControlService actorControlService;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(ExecutorGrpcService.class);
|
||||
|
||||
@Inject
|
||||
public ExecutorGrpcService(ActorApi actorApi,
|
||||
FileStorageService fileStorageService,
|
||||
@@ -228,14 +232,35 @@ public class ExecutorGrpcService
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private RpcFileStorageEntry createFileModel(Path path) {
|
||||
return RpcFileStorageEntry.newBuilder()
|
||||
.setName(path.toFile().getName())
|
||||
.setSize(Files.size(path))
|
||||
.setLastModifiedTime(Files.getLastModifiedTime(path).toInstant().toString())
|
||||
.build();
|
||||
try {
|
||||
return RpcFileStorageEntry.newBuilder()
|
||||
.setName(path.toFile().getName())
|
||||
.setSize(Files.size(path))
|
||||
.setLastModifiedTime(Files.getLastModifiedTime(path).toInstant().toString())
|
||||
.build();
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void restartExecutorService(Empty request, StreamObserver<Empty> responseObserver) {
|
||||
responseObserver.onNext(Empty.getDefaultInstance());
|
||||
responseObserver.onCompleted();
|
||||
|
||||
logger.info("Restarting executor service on node {}", serviceConfiguration.node());
|
||||
|
||||
try {
|
||||
// Wait for the response to be sent before restarting
|
||||
Thread.sleep(Duration.ofSeconds(5));
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
logger.warn("Interrupted while waiting for restart", e);
|
||||
}
|
||||
|
||||
System.exit(0);
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -13,6 +13,8 @@ public class ProcessOutboxes {
|
||||
private final MqOutbox loaderOutbox;
|
||||
private final MqOutbox crawlerOutbox;
|
||||
private final MqOutbox indexConstructorOutbox;
|
||||
private final MqOutbox liveCrawlerOutbox;
|
||||
private final MqOutbox exportTasksOutbox;
|
||||
|
||||
@Inject
|
||||
public ProcessOutboxes(BaseServiceParams params, MqPersistence persistence) {
|
||||
@@ -44,6 +46,22 @@ public class ProcessOutboxes {
|
||||
params.configuration.node(),
|
||||
params.configuration.instanceUuid()
|
||||
);
|
||||
|
||||
liveCrawlerOutbox = new MqOutbox(persistence,
|
||||
ProcessInboxNames.LIVE_CRAWLER_INBOX,
|
||||
params.configuration.node(),
|
||||
params.configuration.serviceName(),
|
||||
params.configuration.node(),
|
||||
params.configuration.instanceUuid()
|
||||
);
|
||||
|
||||
exportTasksOutbox = new MqOutbox(persistence,
|
||||
ProcessInboxNames.EXPORT_TASK_INBOX,
|
||||
params.configuration.node(),
|
||||
params.configuration.serviceName(),
|
||||
params.configuration.node(),
|
||||
params.configuration.instanceUuid()
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -60,4 +78,8 @@ public class ProcessOutboxes {
|
||||
}
|
||||
|
||||
public MqOutbox getIndexConstructorOutbox() { return indexConstructorOutbox; }
|
||||
|
||||
public MqOutbox getLiveCrawlerOutbox() { return liveCrawlerOutbox; }
|
||||
|
||||
public MqOutbox getExportTasksOutbox() { return exportTasksOutbox; }
|
||||
}
|
||||
|
@@ -3,14 +3,14 @@ package nu.marginalia.process;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.adjacencies.WebsiteAdjacenciesCalculator;
|
||||
import nu.marginalia.converting.ConverterMain;
|
||||
import nu.marginalia.crawl.CrawlerMain;
|
||||
import nu.marginalia.index.IndexConstructorMain;
|
||||
import nu.marginalia.livecrawler.LiveCrawlerMain;
|
||||
import nu.marginalia.loading.LoaderMain;
|
||||
import nu.marginalia.service.ProcessMainClass;
|
||||
import nu.marginalia.service.control.ServiceEventLog;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
import nu.marginalia.task.ExportTasksMain;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.slf4j.Marker;
|
||||
@@ -37,23 +37,24 @@ public class ProcessService {
|
||||
private final int node;
|
||||
|
||||
|
||||
public static ProcessService.ProcessId translateExternalIdBase(String id) {
|
||||
public static ProcessId translateExternalIdBase(String id) {
|
||||
return switch (id) {
|
||||
case "converter" -> ProcessService.ProcessId.CONVERTER;
|
||||
case "crawler" -> ProcessService.ProcessId.CRAWLER;
|
||||
case "loader" -> ProcessService.ProcessId.LOADER;
|
||||
case "website-adjacencies-calculator" -> ProcessService.ProcessId.ADJACENCIES_CALCULATOR;
|
||||
case "index-constructor" -> ProcessService.ProcessId.INDEX_CONSTRUCTOR;
|
||||
case "converter" -> ProcessId.CONVERTER;
|
||||
case "crawler" -> ProcessId.CRAWLER;
|
||||
case "loader" -> ProcessId.LOADER;
|
||||
case "export-tasks" -> ProcessId.EXPORT_TASKS;
|
||||
case "index-constructor" -> ProcessId.INDEX_CONSTRUCTOR;
|
||||
default -> null;
|
||||
};
|
||||
}
|
||||
|
||||
public enum ProcessId {
|
||||
CRAWLER(CrawlerMain.class),
|
||||
LIVE_CRAWLER(LiveCrawlerMain.class),
|
||||
CONVERTER(ConverterMain.class),
|
||||
LOADER(LoaderMain.class),
|
||||
INDEX_CONSTRUCTOR(IndexConstructorMain.class),
|
||||
ADJACENCIES_CALCULATOR(WebsiteAdjacenciesCalculator.class)
|
||||
EXPORT_TASKS(ExportTasksMain.class),
|
||||
;
|
||||
|
||||
public final String mainClass;
|
||||
@@ -64,10 +65,11 @@ public class ProcessService {
|
||||
List<String> envOpts() {
|
||||
String variable = switch (this) {
|
||||
case CRAWLER -> "CRAWLER_PROCESS_OPTS";
|
||||
case LIVE_CRAWLER -> "LIVE_CRAWLER_PROCESS_OPTS";
|
||||
case CONVERTER -> "CONVERTER_PROCESS_OPTS";
|
||||
case LOADER -> "LOADER_PROCESS_OPTS";
|
||||
case INDEX_CONSTRUCTOR -> "INDEX_CONSTRUCTION_PROCESS_OPTS";
|
||||
case ADJACENCIES_CALCULATOR -> "ADJACENCIES_CALCULATOR_PROCESS_OPTS";
|
||||
case EXPORT_TASKS -> "EXPORT_TASKS_PROCESS_OPTS";
|
||||
};
|
||||
String value = System.getenv(variable);
|
||||
|
||||
|
@@ -9,7 +9,8 @@ import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.util.*;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.ForkJoinPool;
|
||||
import java.util.stream.Collectors;
|
||||
@@ -97,7 +98,7 @@ public class RedditSideloadHelper {
|
||||
|
||||
private static Path getRedditDbPath(RedditFilePair pair) throws IOException {
|
||||
String hash = SideloadHelper.getCrc32FileHash(pair.commentsPath());
|
||||
return pair.rootDir().resolve(STR."\{pair.fileNameBase}.\{hash}.db");
|
||||
return pair.rootDir().resolve(pair.fileNameBase + "." + hash + ".db");
|
||||
}
|
||||
|
||||
}
|
@@ -83,7 +83,7 @@ public class StackExchangeSideloadHelper {
|
||||
String fileName = sourcePath.toFile().getName();
|
||||
String hash = SideloadHelper.getCrc32FileHash(sourcePath);
|
||||
|
||||
return sourcePath.getParent().resolve(STR."\{fileName}.\{hash}.db");
|
||||
return sourcePath.getParent().resolve(fileName + "." + hash + ".db");
|
||||
}
|
||||
|
||||
private static Optional<String> getStackexchangeDomainFromFilename(String fileName) {
|
||||
|
@@ -1,24 +0,0 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
implementation libs.notnull
|
||||
implementation libs.gson
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
|
||||
}
|
@@ -1,58 +0,0 @@
|
||||
package nu.marginalia.feedlot;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import nu.marginalia.feedlot.model.FeedItems;
|
||||
|
||||
import java.net.URI;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.time.Duration;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
|
||||
public class FeedlotClient {
|
||||
private final String feedlotHost;
|
||||
private final int feedlotPort;
|
||||
private final Gson gson;
|
||||
private final HttpClient httpClient;
|
||||
private final Duration requestTimeout;
|
||||
|
||||
public FeedlotClient(String feedlotHost,
|
||||
int feedlotPort,
|
||||
Gson gson,
|
||||
Duration connectTimeout,
|
||||
Duration requestTimeout
|
||||
)
|
||||
{
|
||||
this.feedlotHost = feedlotHost;
|
||||
this.feedlotPort = feedlotPort;
|
||||
this.gson = gson;
|
||||
|
||||
httpClient = HttpClient.newBuilder()
|
||||
.executor(Executors.newCachedThreadPool())
|
||||
.connectTimeout(connectTimeout)
|
||||
.build();
|
||||
this.requestTimeout = requestTimeout;
|
||||
}
|
||||
|
||||
public CompletableFuture<FeedItems> getFeedItems(String domainName) {
|
||||
return httpClient.sendAsync(
|
||||
HttpRequest.newBuilder()
|
||||
.uri(URI.create("http://%s:%d/feed/%s".formatted(feedlotHost, feedlotPort, domainName)))
|
||||
.GET()
|
||||
.timeout(requestTimeout)
|
||||
.build(),
|
||||
HttpResponse.BodyHandlers.ofString()
|
||||
).thenApply(HttpResponse::body)
|
||||
.thenApply(this::parseFeedItems);
|
||||
}
|
||||
|
||||
private FeedItems parseFeedItems(String s) {
|
||||
return gson.fromJson(s, FeedItems.class);
|
||||
}
|
||||
|
||||
public void stop() {
|
||||
httpClient.close();
|
||||
}
|
||||
}
|
@@ -1,17 +0,0 @@
|
||||
package nu.marginalia.feedlot.model;
|
||||
|
||||
/**
 * A single feed entry.
 *
 * @param title       entry title
 * @param date        publication date string; {@link #pubDay()} treats the first ten
 *                    characters as the day part
 * @param description entry description, unescaped
 * @param url         entry URL
 */
public record FeedItem(String title, String date, String description, String url) {

    /** Returns the first ten characters of {@code date}, or {@code date} itself if it is not longer than that. */
    public String pubDay() { // Extract the date from an ISO style date string
        if (date.length() > 10) {
            return date.substring(0, 10);
        }
        return date;
    }

    /**
     * Returns the description with angle brackets replaced by HTML entities, so that
     * markup in the description is not interpreted when it is rendered.
     */
    public String descriptionSafe() {
        return description
                .replace("<", "&lt;")
                .replace(">", "&gt;");
    }
}
|
@@ -1,6 +0,0 @@
|
||||
package nu.marginalia.feedlot.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public record FeedItems(String domain, String feedUrl, String updated, List<FeedItem> items) {
|
||||
}
|
@@ -1,20 +0,0 @@
|
||||
Client for [FeedlotTheFeedBot](https://github.com/MarginaliaSearch/FeedLotTheFeedBot),
|
||||
the RSS/Atom feed fetcher and cache for Marginalia Search.
|
||||
|
||||
This service is external to the Marginalia Search codebase,
|
||||
as it is not a core part of the search engine and has other
|
||||
utilities.
|
||||
|
||||
## Example
|
||||
|
||||
```java
|
||||
|
||||
import java.time.Duration;
|
||||
|
||||
var client = new FeedlotClient("localhost", 8080,
|
||||
gson,
|
||||
Duration.ofMillis(100), // connect timeout
|
||||
Duration.ofMillis(100)); // request timeout
|
||||
|
||||
CompletableFuture<FeedItems> items = client.getFeedItems("www.marginalia.nu");
|
||||
```
|
@@ -3,7 +3,6 @@ package nu.marginalia.screenshot;
|
||||
import com.google.common.base.Strings;
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.slf4j.Logger;
|
||||
@@ -11,6 +10,7 @@ import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
|
||||
import static java.lang.Integer.parseInt;
|
||||
@@ -48,7 +48,6 @@ public class ScreenshotService {
|
||||
return false;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public Object serveScreenshotRequest(Request request, Response response) {
|
||||
if (Strings.isNullOrEmpty(request.params("id"))) {
|
||||
response.redirect("https://search.marginalia.nu/");
|
||||
@@ -75,6 +74,9 @@ public class ScreenshotService {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.warn("IO error", ex);
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.warn("SQL error", ex);
|
||||
}
|
||||
|
@@ -1,9 +1,9 @@
|
||||
package nu.marginalia.api.domains;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.api.domains.model.DomainInformation;
|
||||
import nu.marginalia.api.domains.model.SimilarDomain;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.api.domains.model.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@@ -28,18 +28,22 @@ public class DomainsProtobufCodec {
|
||||
return ret;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private static SimilarDomain convertResponseEntry(RpcSimilarDomain sd) {
|
||||
return new SimilarDomain(
|
||||
new EdgeUrl(sd.getUrl()),
|
||||
sd.getDomainId(),
|
||||
sd.getRelatedness(),
|
||||
sd.getRank(),
|
||||
sd.getIndexed(),
|
||||
sd.getActive(),
|
||||
sd.getScreenshot(),
|
||||
SimilarDomain.LinkType.valueOf(sd.getLinkType().name())
|
||||
);
|
||||
try {
|
||||
return new SimilarDomain(
|
||||
new EdgeUrl(sd.getUrl()),
|
||||
sd.getDomainId(),
|
||||
sd.getRelatedness(),
|
||||
sd.getRank(),
|
||||
sd.getIndexed(),
|
||||
sd.getActive(),
|
||||
sd.getScreenshot(),
|
||||
SimilarDomain.LinkType.valueOf(sd.getLinkType().name())
|
||||
);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -1,10 +1,7 @@
|
||||
package nu.marginalia.api.domains.model;
|
||||
|
||||
import lombok.*;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
|
||||
@Getter @AllArgsConstructor @NoArgsConstructor @Builder
|
||||
@ToString
|
||||
public class DomainInformation {
|
||||
EdgeDomain domain;
|
||||
|
||||
@@ -29,6 +26,34 @@ public class DomainInformation {
|
||||
String ipCountry;
|
||||
String state;
|
||||
|
||||
public DomainInformation(EdgeDomain domain, boolean blacklisted, int pagesKnown, int pagesFetched, int pagesIndexed, int incomingLinks, int outboundLinks, int nodeAffinity, double ranking, boolean suggestForCrawling, boolean inCrawlQueue, boolean unknownDomain, String ip, Integer asn, String asnOrg, String asnCountry, String ipCountry, String state) {
|
||||
this.domain = domain;
|
||||
this.blacklisted = blacklisted;
|
||||
this.pagesKnown = pagesKnown;
|
||||
this.pagesFetched = pagesFetched;
|
||||
this.pagesIndexed = pagesIndexed;
|
||||
this.incomingLinks = incomingLinks;
|
||||
this.outboundLinks = outboundLinks;
|
||||
this.nodeAffinity = nodeAffinity;
|
||||
this.ranking = ranking;
|
||||
this.suggestForCrawling = suggestForCrawling;
|
||||
this.inCrawlQueue = inCrawlQueue;
|
||||
this.unknownDomain = unknownDomain;
|
||||
this.ip = ip;
|
||||
this.asn = asn;
|
||||
this.asnOrg = asnOrg;
|
||||
this.asnCountry = asnCountry;
|
||||
this.ipCountry = ipCountry;
|
||||
this.state = state;
|
||||
}
|
||||
|
||||
public DomainInformation() {
|
||||
}
|
||||
|
||||
public static DomainInformationBuilder builder() {
|
||||
return new DomainInformationBuilder();
|
||||
}
|
||||
|
||||
public String getIpFlag() {
|
||||
if (ipCountry == null || ipCountry.codePointCount(0, ipCountry.length()) != 2) {
|
||||
return "";
|
||||
@@ -45,4 +70,202 @@ public class DomainInformation {
|
||||
int secondChar = Character.codePointAt(country, 1) - asciiOffset + offset;
|
||||
return new String(Character.toChars(firstChar)) + new String(Character.toChars(secondChar));
|
||||
}
|
||||
|
||||
public EdgeDomain getDomain() {
|
||||
return this.domain;
|
||||
}
|
||||
|
||||
public boolean isBlacklisted() {
|
||||
return this.blacklisted;
|
||||
}
|
||||
|
||||
public int getPagesKnown() {
|
||||
return this.pagesKnown;
|
||||
}
|
||||
|
||||
public int getPagesFetched() {
|
||||
return this.pagesFetched;
|
||||
}
|
||||
|
||||
public int getPagesIndexed() {
|
||||
return this.pagesIndexed;
|
||||
}
|
||||
|
||||
public int getIncomingLinks() {
|
||||
return this.incomingLinks;
|
||||
}
|
||||
|
||||
public int getOutboundLinks() {
|
||||
return this.outboundLinks;
|
||||
}
|
||||
|
||||
public int getNodeAffinity() {
|
||||
return this.nodeAffinity;
|
||||
}
|
||||
|
||||
public double getRanking() {
|
||||
return this.ranking;
|
||||
}
|
||||
|
||||
public boolean isSuggestForCrawling() {
|
||||
return this.suggestForCrawling;
|
||||
}
|
||||
|
||||
public boolean isInCrawlQueue() {
|
||||
return this.inCrawlQueue;
|
||||
}
|
||||
|
||||
public boolean isUnknownDomain() {
|
||||
return this.unknownDomain;
|
||||
}
|
||||
|
||||
public String getIp() {
|
||||
return this.ip;
|
||||
}
|
||||
|
||||
public Integer getAsn() {
|
||||
return this.asn;
|
||||
}
|
||||
|
||||
public String getAsnOrg() {
|
||||
return this.asnOrg;
|
||||
}
|
||||
|
||||
public String getAsnCountry() {
|
||||
return this.asnCountry;
|
||||
}
|
||||
|
||||
public String getIpCountry() {
|
||||
return this.ipCountry;
|
||||
}
|
||||
|
||||
public String getState() {
|
||||
return this.state;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "DomainInformation(domain=" + this.getDomain() + ", blacklisted=" + this.isBlacklisted() + ", pagesKnown=" + this.getPagesKnown() + ", pagesFetched=" + this.getPagesFetched() + ", pagesIndexed=" + this.getPagesIndexed() + ", incomingLinks=" + this.getIncomingLinks() + ", outboundLinks=" + this.getOutboundLinks() + ", nodeAffinity=" + this.getNodeAffinity() + ", ranking=" + this.getRanking() + ", suggestForCrawling=" + this.isSuggestForCrawling() + ", inCrawlQueue=" + this.isInCrawlQueue() + ", unknownDomain=" + this.isUnknownDomain() + ", ip=" + this.getIp() + ", asn=" + this.getAsn() + ", asnOrg=" + this.getAsnOrg() + ", asnCountry=" + this.getAsnCountry() + ", ipCountry=" + this.getIpCountry() + ", state=" + this.getState() + ")";
|
||||
}
|
||||
|
||||
public static class DomainInformationBuilder {
|
||||
private EdgeDomain domain;
|
||||
private boolean blacklisted;
|
||||
private int pagesKnown;
|
||||
private int pagesFetched;
|
||||
private int pagesIndexed;
|
||||
private int incomingLinks;
|
||||
private int outboundLinks;
|
||||
private int nodeAffinity;
|
||||
private double ranking;
|
||||
private boolean suggestForCrawling;
|
||||
private boolean inCrawlQueue;
|
||||
private boolean unknownDomain;
|
||||
private String ip;
|
||||
private Integer asn;
|
||||
private String asnOrg;
|
||||
private String asnCountry;
|
||||
private String ipCountry;
|
||||
private String state;
|
||||
|
||||
DomainInformationBuilder() {
|
||||
}
|
||||
|
||||
public DomainInformationBuilder domain(EdgeDomain domain) {
|
||||
this.domain = domain;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder blacklisted(boolean blacklisted) {
|
||||
this.blacklisted = blacklisted;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder pagesKnown(int pagesKnown) {
|
||||
this.pagesKnown = pagesKnown;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder pagesFetched(int pagesFetched) {
|
||||
this.pagesFetched = pagesFetched;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder pagesIndexed(int pagesIndexed) {
|
||||
this.pagesIndexed = pagesIndexed;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder incomingLinks(int incomingLinks) {
|
||||
this.incomingLinks = incomingLinks;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder outboundLinks(int outboundLinks) {
|
||||
this.outboundLinks = outboundLinks;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder nodeAffinity(int nodeAffinity) {
|
||||
this.nodeAffinity = nodeAffinity;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder ranking(double ranking) {
|
||||
this.ranking = ranking;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder suggestForCrawling(boolean suggestForCrawling) {
|
||||
this.suggestForCrawling = suggestForCrawling;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder inCrawlQueue(boolean inCrawlQueue) {
|
||||
this.inCrawlQueue = inCrawlQueue;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder unknownDomain(boolean unknownDomain) {
|
||||
this.unknownDomain = unknownDomain;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder ip(String ip) {
|
||||
this.ip = ip;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder asn(Integer asn) {
|
||||
this.asn = asn;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder asnOrg(String asnOrg) {
|
||||
this.asnOrg = asnOrg;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder asnCountry(String asnCountry) {
|
||||
this.asnCountry = asnCountry;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder ipCountry(String ipCountry) {
|
||||
this.ipCountry = ipCountry;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder state(String state) {
|
||||
this.state = state;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformation build() {
|
||||
return new DomainInformation(this.domain, this.blacklisted, this.pagesKnown, this.pagesFetched, this.pagesIndexed, this.incomingLinks, this.outboundLinks, this.nodeAffinity, this.ranking, this.suggestForCrawling, this.inCrawlQueue, this.unknownDomain, this.ip, this.asn, this.asnOrg, this.asnCountry, this.ipCountry, this.state);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "DomainInformation.DomainInformationBuilder(domain=" + this.domain + ", blacklisted=" + this.blacklisted + ", pagesKnown=" + this.pagesKnown + ", pagesFetched=" + this.pagesFetched + ", pagesIndexed=" + this.pagesIndexed + ", incomingLinks=" + this.incomingLinks + ", outboundLinks=" + this.outboundLinks + ", nodeAffinity=" + this.nodeAffinity + ", ranking=" + this.ranking + ", suggestForCrawling=" + this.suggestForCrawling + ", inCrawlQueue=" + this.inCrawlQueue + ", unknownDomain=" + this.unknownDomain + ", ip=" + this.ip + ", asn=" + this.asn + ", asnOrg=" + this.asnOrg + ", asnCountry=" + this.asnCountry + ", ipCountry=" + this.ipCountry + ", state=" + this.state + ")";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -56,10 +56,7 @@ public class DomainInformationService {
|
||||
|
||||
ResultSet rs;
|
||||
|
||||
rs = stmt.executeQuery(STR."""
|
||||
SELECT IP, NODE_AFFINITY, DOMAIN_NAME, STATE, IFNULL(RANK, 1) AS RANK
|
||||
FROM EC_DOMAIN WHERE ID=\{domainId}
|
||||
""");
|
||||
rs = stmt.executeQuery("SELECT IP, NODE_AFFINITY, DOMAIN_NAME, STATE, IFNULL(RANK, 1) AS RANK\n FROM EC_DOMAIN WHERE ID=" + domainId + "\n ");
|
||||
if (rs.next()) {
|
||||
String ip = rs.getString("IP");
|
||||
|
||||
@@ -77,20 +74,14 @@ public class DomainInformationService {
|
||||
builder.setState(rs.getString("STATE"));
|
||||
builder.setRanking(Math.round(100.0*(1.0-rs.getDouble("RANK"))));
|
||||
}
|
||||
rs = stmt.executeQuery(STR."""
|
||||
SELECT 1 FROM CRAWL_QUEUE
|
||||
INNER JOIN EC_DOMAIN ON CRAWL_QUEUE.DOMAIN_NAME = EC_DOMAIN.DOMAIN_NAME
|
||||
WHERE EC_DOMAIN.ID=\{domainId}
|
||||
""");
|
||||
rs = stmt.executeQuery("SELECT 1 FROM CRAWL_QUEUE\nINNER JOIN EC_DOMAIN ON CRAWL_QUEUE.DOMAIN_NAME = EC_DOMAIN.DOMAIN_NAME\nWHERE EC_DOMAIN.ID=" + domainId + "\n ");
|
||||
inCrawlQueue = rs.next();
|
||||
builder.setInCrawlQueue(inCrawlQueue);
|
||||
|
||||
builder.setIncomingLinks(linkGraphClient.countLinksToDomain(domainId));
|
||||
builder.setOutboundLinks(linkGraphClient.countLinksFromDomain(domainId));
|
||||
|
||||
rs = stmt.executeQuery(STR."""
|
||||
SELECT KNOWN_URLS, GOOD_URLS, VISITED_URLS FROM DOMAIN_METADATA WHERE ID=\{domainId}
|
||||
""");
|
||||
rs = stmt.executeQuery("SELECT KNOWN_URLS, GOOD_URLS, VISITED_URLS FROM DOMAIN_METADATA WHERE ID=" + domainId + "\n ");
|
||||
if (rs.next()) {
|
||||
pagesVisited = rs.getInt("VISITED_URLS");
|
||||
|
||||
|
@@ -20,6 +20,7 @@ dependencies {
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:libraries:message-queue')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
|
@@ -0,0 +1,85 @@
|
||||
package nu.marginalia.api.feeds;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||
import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
|
||||
import javax.annotation.CheckReturnValue;
|
||||
import java.time.Instant;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.function.BiConsumer;
|
||||
|
||||
@Singleton
|
||||
public class FeedsClient {
|
||||
private final ExecutorService executorService = Executors.newCachedThreadPool();
|
||||
private final GrpcSingleNodeChannelPool<FeedApiGrpc.FeedApiBlockingStub> channelPool;
|
||||
private final MqOutbox updateFeedsOutbox;
|
||||
|
||||
@Inject
|
||||
public FeedsClient(GrpcChannelPoolFactory factory,
|
||||
MqPersistence mqPersistence,
|
||||
ServiceConfiguration serviceConfiguration) {
|
||||
|
||||
// The client is only interested in the primary node
|
||||
var key = ServiceKey.forGrpcApi(FeedApiGrpc.class, ServicePartition.any());
|
||||
|
||||
this.channelPool = factory.createSingle(key, FeedApiGrpc::newBlockingStub);
|
||||
this.updateFeedsOutbox = new MqOutbox(mqPersistence,
|
||||
"update-rss-feeds", 0,
|
||||
serviceConfiguration.serviceName(), serviceConfiguration.node(),
|
||||
UUID.randomUUID());
|
||||
}
|
||||
|
||||
public CompletableFuture<RpcFeed> getFeed(int domainId) {
|
||||
try {
|
||||
return channelPool.call(FeedApiGrpc.FeedApiBlockingStub::getFeed)
|
||||
.async(executorService)
|
||||
.run(RpcDomainId.newBuilder().setDomainId(domainId).build());
|
||||
}
|
||||
catch (Exception e) {
|
||||
return CompletableFuture.failedFuture(e);
|
||||
}
|
||||
}
|
||||
|
||||
public void getUpdatedDomains(Instant since, BiConsumer<String, List<String>> consumer) throws ExecutionException, InterruptedException {
|
||||
channelPool.call(FeedApiGrpc.FeedApiBlockingStub::getUpdatedLinks)
|
||||
.run(RpcUpdatedLinksRequest.newBuilder().setSinceEpochMillis(since.toEpochMilli()).build())
|
||||
.forEachRemaining(rsp -> consumer.accept(rsp.getDomain(), new ArrayList<>(rsp.getUrlList())));
|
||||
}
|
||||
|
||||
/** Get the hash of the feed data, for identifying when the data has been updated */
|
||||
public String getFeedDataHash() {
|
||||
return channelPool.call(FeedApiGrpc.FeedApiBlockingStub::getFeedDataHash)
|
||||
.run(Empty.getDefaultInstance())
|
||||
.getHash();
|
||||
}
|
||||
|
||||
/** Update the feeds, return a message ID for the update */
|
||||
@CheckReturnValue
|
||||
public long updateFeeds(RpcFeedUpdateMode mode) throws Exception {
|
||||
// Create a message for the {@link MqLongRunningTask} paradigm to use for tracking the task
|
||||
long msgId = updateFeedsOutbox.sendAsync("updateFeeds", "");
|
||||
|
||||
channelPool.call(FeedApiGrpc.FeedApiBlockingStub::updateFeeds)
|
||||
.run(RpcUpdateRequest.newBuilder()
|
||||
.setMode(mode)
|
||||
.setMsgId(msgId)
|
||||
.build()
|
||||
);
|
||||
|
||||
return msgId;
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,59 @@
|
||||
syntax="proto3";
|
||||
package nu.marginalia.api.feeds;
|
||||
|
||||
option java_package="nu.marginalia.api.feeds";
|
||||
option java_multiple_files=true;
|
||||
|
||||
|
||||
service FeedApi {
|
||||
rpc getFeed(RpcDomainId) returns (RpcFeed) {}
|
||||
rpc getFeedDataHash(Empty) returns (RpcFeedDataHash) {}
|
||||
rpc updateFeeds(RpcUpdateRequest) returns (Empty) {}
|
||||
rpc getUpdatedLinks(RpcUpdatedLinksRequest) returns (stream RpcUpdatedLinksResponse) {}
|
||||
}
|
||||
|
||||
message RpcUpdatedLinksRequest {
|
||||
int64 sinceEpochMillis = 1;
|
||||
}
|
||||
|
||||
message RpcUpdatedLinksResponse {
|
||||
string domain = 1;
|
||||
repeated string url = 2;
|
||||
}
|
||||
|
||||
message RpcFeedDataHash {
|
||||
string hash = 1;
|
||||
}
|
||||
|
||||
message RpcDomainId {
|
||||
int32 domainId = 1;
|
||||
}
|
||||
|
||||
message RpcUpdateRequest {
|
||||
RpcFeedUpdateMode mode = 1;
|
||||
int64 msgId = 2; // Id for a message on the message queue, will be replied to with a dummy response when the task is done,
|
||||
// if the message id is not positive, no response will be attempted to be sent.
|
||||
}
|
||||
|
||||
enum RpcFeedUpdateMode {
|
||||
CLEAN = 0; // Start over with a new database from system rss exports
|
||||
REFRESH = 1; // Refresh known feeds
|
||||
}
|
||||
|
||||
message RpcFeed {
|
||||
int32 domainId = 1;
|
||||
string domain = 2;
|
||||
string feedUrl = 3;
|
||||
string updated = 4;
|
||||
repeated RpcFeedItem items = 5;
|
||||
int64 fetchTimestamp = 6;
|
||||
}
|
||||
|
||||
message RpcFeedItem {
|
||||
string title = 1;
|
||||
string date = 2;
|
||||
string description = 3;
|
||||
string url = 4;
|
||||
}
|
||||
|
||||
message Empty {}
|
@@ -4,15 +4,12 @@ package nu.marginalia.api.livecapture;
|
||||
option java_package="nu.marginalia.api.livecapture";
|
||||
option java_multiple_files=true;
|
||||
|
||||
service LiveCaptureApi {
|
||||
rpc requestScreengrab(RpcDomainId) returns (Empty) {}
|
||||
}
|
||||
|
||||
message Void {
|
||||
}
|
||||
|
||||
message RpcDomainId {
|
||||
int32 domainId = 1;
|
||||
}
|
||||
|
||||
service LiveCaptureApi {
|
||||
rpc requestScreengrab(RpcDomainId) returns (Empty) {}
|
||||
}
|
||||
|
||||
message Empty {}
|
@@ -20,9 +20,18 @@ dependencies {
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:db')
|
||||
implementation project(':code:libraries:blocking-thread-pool')
|
||||
implementation project(':code:libraries:message-queue')
|
||||
|
||||
implementation project(':code:execution:api')
|
||||
|
||||
implementation libs.jsoup
|
||||
implementation libs.rssreader
|
||||
implementation libs.opencsv
|
||||
implementation libs.sqlite
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.commons.lang3
|
||||
implementation libs.commons.io
|
||||
|
||||
implementation libs.prometheus
|
||||
implementation libs.guava
|
||||
|
@@ -0,0 +1,230 @@
|
||||
package nu.marginalia.rss.db;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.rss.model.FeedDefinition;
|
||||
import nu.marginalia.rss.model.FeedItems;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.nio.file.attribute.PosixFileAttributes;
|
||||
import java.security.MessageDigest;
|
||||
import java.time.Instant;
|
||||
import java.util.Base64;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.function.BiConsumer;
|
||||
|
||||
@Singleton
|
||||
public class FeedDb {
|
||||
private static final Logger logger = LoggerFactory.getLogger(FeedDb.class);
|
||||
|
||||
private static final String dbFileName = "rss-feeds.db";
|
||||
|
||||
private final Path readerDbPath;
|
||||
private volatile FeedDbReader reader;
|
||||
|
||||
private final boolean feedDbEnabled;
|
||||
|
||||
@Inject
|
||||
public FeedDb(ServiceConfiguration serviceConfiguration) {
|
||||
feedDbEnabled = serviceConfiguration.node() <= 1;
|
||||
readerDbPath = WmsaHome.getDataPath().resolve(dbFileName);
|
||||
|
||||
if (!feedDbEnabled) {
|
||||
logger.info("Feed database is disabled on this node");
|
||||
}
|
||||
else {
|
||||
try {
|
||||
reader = new FeedDbReader(readerDbPath);
|
||||
} catch (Exception e) {
|
||||
reader = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Constructor for testing */
|
||||
public FeedDb(Path dbPath) {
|
||||
feedDbEnabled = true;
|
||||
readerDbPath = dbPath;
|
||||
|
||||
try {
|
||||
reader = new FeedDbReader(readerDbPath);
|
||||
} catch (Exception e) {
|
||||
reader = null;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isEnabled() {
|
||||
return feedDbEnabled;
|
||||
}
|
||||
|
||||
public List<FeedDefinition> getAllFeeds() {
|
||||
if (!feedDbEnabled) {
|
||||
throw new IllegalStateException("Feed database is disabled on this node");
|
||||
}
|
||||
|
||||
// Capture the current reader to avoid concurrency issues
|
||||
FeedDbReader reader = this.reader;
|
||||
|
||||
try {
|
||||
if (reader != null) {
|
||||
return reader.getAllFeeds();
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Error getting all feeds", e);
|
||||
}
|
||||
return List.of();
|
||||
}
|
||||
|
||||
public Map<String, Integer> getAllErrorCounts() {
|
||||
if (!feedDbEnabled) {
|
||||
throw new IllegalStateException("Feed database is disabled on this node");
|
||||
}
|
||||
|
||||
// Capture the current reader to avoid concurrency issues
|
||||
FeedDbReader reader = this.reader;
|
||||
|
||||
try {
|
||||
if (reader != null) {
|
||||
return reader.getAllErrorCounts();
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Error getting all feeds", e);
|
||||
}
|
||||
return Map.of();
|
||||
}
|
||||
|
||||
|
||||
@NotNull
|
||||
public FeedItems getFeed(EdgeDomain domain) {
|
||||
if (!feedDbEnabled) {
|
||||
throw new IllegalStateException("Feed database is disabled on this node");
|
||||
}
|
||||
|
||||
// Capture the current reader to avoid concurrency issues
|
||||
FeedDbReader reader = this.reader;
|
||||
try {
|
||||
if (reader != null) {
|
||||
return reader.getFeed(domain);
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Error getting feed for " + domain, e);
|
||||
}
|
||||
return FeedItems.none();
|
||||
}
|
||||
|
||||
public Optional<String> getFeedAsJson(String domain) {
|
||||
if (!feedDbEnabled) {
|
||||
throw new IllegalStateException("Feed database is disabled on this node");
|
||||
}
|
||||
|
||||
// Capture the current reader to avoid concurrency issues
|
||||
FeedDbReader reader = this.reader;
|
||||
|
||||
try {
|
||||
if (reader != null) {
|
||||
return reader.getFeedAsJson(domain);
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Error getting feed for " + domain, e);
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
public FeedDbWriter createWriter() {
|
||||
if (!feedDbEnabled) {
|
||||
throw new IllegalStateException("Feed database is disabled on this node");
|
||||
}
|
||||
|
||||
try {
|
||||
Path dbFile = Files.createTempFile(readerDbPath.getParent(), "rss-feeds", ".tmp.db");
|
||||
return new FeedDbWriter(dbFile);
|
||||
} catch (Exception e) {
|
||||
logger.error("Error creating new database writer", e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public void switchDb(FeedDbWriter writer) {
|
||||
if (!feedDbEnabled) {
|
||||
throw new IllegalStateException("Feed database is disabled on this node");
|
||||
}
|
||||
|
||||
try {
|
||||
logger.info("Switching to new feed database from " + writer.getDbPath() + " to " + readerDbPath);
|
||||
|
||||
writer.close();
|
||||
reader.close();
|
||||
|
||||
Files.move(writer.getDbPath(), readerDbPath, StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE);
|
||||
|
||||
reader = new FeedDbReader(readerDbPath);
|
||||
} catch (Exception e) {
|
||||
logger.error("Fatal error switching to new feed database", e);
|
||||
|
||||
reader = null;
|
||||
}
|
||||
}
|
||||
|
||||
public String getDataHash() throws Exception {
|
||||
MessageDigest digest = MessageDigest.getInstance("MD5");
|
||||
|
||||
byte[] buffer = new byte[4096];
|
||||
|
||||
try (var inputStream = new BufferedInputStream(Files.newInputStream(readerDbPath))) {
|
||||
int rb;
|
||||
|
||||
while ((rb = inputStream.read(buffer)) >= 0) {
|
||||
digest.update(buffer, 0, rb);
|
||||
}
|
||||
}
|
||||
|
||||
return Base64.getEncoder().encodeToString(digest.digest());
|
||||
}
|
||||
|
||||
public void getLinksUpdatedSince(Instant since, BiConsumer<String, List<String>> consumer) throws Exception {
|
||||
if (!feedDbEnabled) {
|
||||
throw new IllegalStateException("Feed database is disabled on this node");
|
||||
}
|
||||
|
||||
// Capture the current reader to avoid concurrency issues
|
||||
FeedDbReader reader = this.reader;
|
||||
|
||||
if (reader == null) {
|
||||
throw new NullPointerException("Reader is not available");
|
||||
}
|
||||
|
||||
reader.getLinksUpdatedSince(since, consumer);
|
||||
}
|
||||
|
||||
public Instant getFetchTime() {
|
||||
if (!Files.exists(readerDbPath)) {
|
||||
return Instant.ofEpochMilli(0);
|
||||
}
|
||||
|
||||
try {
|
||||
return Files.readAttributes(readerDbPath, PosixFileAttributes.class)
|
||||
.creationTime()
|
||||
.toInstant();
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.error("Failed to read the creatiom time of {}", readerDbPath);
|
||||
return Instant.ofEpochMilli(0);
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,144 @@
|
||||
package nu.marginalia.rss.db;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.rss.model.FeedDefinition;
|
||||
import nu.marginalia.rss.model.FeedItems;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.SQLException;
|
||||
import java.time.Instant;
|
||||
import java.util.*;
|
||||
import java.util.function.BiConsumer;
|
||||
|
||||
public class FeedDbReader implements AutoCloseable {
|
||||
private static final Logger logger = LoggerFactory.getLogger(FeedDbReader.class);
|
||||
private static final Gson gson = new GsonBuilder().create();
|
||||
private final Connection connection;
|
||||
|
||||
public FeedDbReader(Path dbPath) throws SQLException {
|
||||
String dbUrl = "jdbc:sqlite:" + dbPath.toAbsolutePath();
|
||||
|
||||
logger.info("Opening feed db at " + dbUrl);
|
||||
|
||||
connection = DriverManager.getConnection(dbUrl);
|
||||
|
||||
// Create table if it doesn't exist to avoid errors before any feeds have been fetched
|
||||
try (var stmt = connection.createStatement()) {
|
||||
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS feed (domain TEXT PRIMARY KEY, feed JSON)");
|
||||
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS errors (domain TEXT PRIMARY KEY, cnt INT DEFAULT 0)");
|
||||
}
|
||||
}
|
||||
|
||||
public List<FeedDefinition> getAllFeeds() {
|
||||
List<FeedDefinition> ret = new ArrayList<>();
|
||||
|
||||
try (var stmt = connection.createStatement()) {
|
||||
var rs = stmt.executeQuery("""
|
||||
select
|
||||
json_extract(feed, '$.domain') as domain,
|
||||
json_extract(feed, '$.feedUrl') as url,
|
||||
json_extract(feed, '$.updated') as updated
|
||||
from feed
|
||||
""");
|
||||
|
||||
while (rs.next()) {
|
||||
ret.add(new FeedDefinition(
|
||||
rs.getString("domain"),
|
||||
rs.getString("url"),
|
||||
rs.getString("updated")));
|
||||
}
|
||||
|
||||
} catch (SQLException e) {
|
||||
logger.error("Error getting all feeds", e);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
public Optional<String> getFeedAsJson(String domain) {
|
||||
try (var stmt = connection.prepareStatement("SELECT FEED FROM feed WHERE DOMAIN = ?")) {
|
||||
stmt.setString(1, domain);
|
||||
|
||||
var rs = stmt.executeQuery();
|
||||
if (rs.next()) {
|
||||
return Optional.of(rs.getString(1));
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
logger.error("Error getting feed for " + domain, e);
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
public Map<String, Integer> getAllErrorCounts() {
|
||||
Map<String, Integer> ret = new HashMap<>(100_000);
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT domain, cnt FROM errors")) {
|
||||
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
ret.put(rs.getString(1), rs.getInt(2));
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
logger.error("Error getting errors", e);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
public FeedItems getFeed(EdgeDomain domain) {
|
||||
try (var stmt = connection.prepareStatement("SELECT FEED FROM feed WHERE DOMAIN = ?")) {
|
||||
stmt.setString(1, domain.toString());
|
||||
var rs = stmt.executeQuery();
|
||||
|
||||
if (rs.next()) {
|
||||
return deserialize(rs.getString(1));
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
logger.error("Error getting feed for " + domain, e);
|
||||
}
|
||||
|
||||
return FeedItems.none();
|
||||
}
|
||||
|
||||
private FeedItems deserialize(String string) {
|
||||
return gson.fromJson(string, FeedItems.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws SQLException {
|
||||
connection.close();
|
||||
}
|
||||
|
||||
|
||||
public void getLinksUpdatedSince(Instant since, BiConsumer<String, List<String>> consumer) {
|
||||
try (var stmt = connection.prepareStatement("SELECT FEED FROM feed")) {
|
||||
var rs = stmt.executeQuery();
|
||||
|
||||
while (rs.next()) {
|
||||
FeedItems items = deserialize(rs.getString(1));
|
||||
|
||||
List<String> urls = new ArrayList<>();
|
||||
for (var item : items.items()) {
|
||||
if (item.getUpdateTimeZD().toInstant().isAfter(since)) {
|
||||
urls.add(item.url());
|
||||
}
|
||||
}
|
||||
|
||||
if (!urls.isEmpty()) {
|
||||
consumer.accept(items.domain(), new ArrayList<>(urls));
|
||||
urls.clear();
|
||||
}
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
logger.error("Error getting updated links", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user