mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 17:32:39 +02:00
Compare commits
67 Commits
deploy-007
...
deploy-011
Author | SHA1 | Date | |
---|---|---|---|
|
c91af247e9 | ||
|
7a31227de1 | ||
|
4f477604c5 | ||
|
2970f4395b | ||
|
d1ec909b36 | ||
|
c67c5bbf42 | ||
|
ecb0e57a1a | ||
|
8c61f61b46 | ||
|
662a18c933 | ||
|
1c2426a052 | ||
|
34df7441ac | ||
|
5387e2bd80 | ||
|
0f3b24d0f8 | ||
|
a732095d2a | ||
|
6607f0112f | ||
|
4913730de9 | ||
|
1db64f9d56 | ||
|
4dcff14498 | ||
|
426658f64e | ||
|
2181b22f05 | ||
|
42bd79a609 | ||
|
b91c1e528a | ||
|
b1130d7a04 | ||
|
8364bcdc97 | ||
|
626cab5fab | ||
|
cfd4712191 | ||
|
9f18ced73d | ||
|
18e91269ab | ||
|
e315ca5758 | ||
|
3ceea17c1d | ||
|
b34527c1a3 | ||
|
185bf28fca | ||
|
78cc25584a | ||
|
62ba30bacf | ||
|
3bb84eb206 | ||
|
be7d13ccce | ||
|
8c088a7c0b | ||
|
ea9a642b9b | ||
|
27f528af6a | ||
|
20ca41ec95 | ||
|
7671f0d9e4 | ||
|
44d6bc71b7 | ||
|
9d302e2973 | ||
|
f553701224 | ||
|
f076d05595 | ||
|
b513809710 | ||
|
7519b28e21 | ||
|
3eac4dd57f | ||
|
4c2810720a | ||
|
8480ba8daa | ||
|
fbba392491 | ||
|
530eb35949 | ||
|
c2dd2175a2 | ||
|
b8581b0f56 | ||
|
2ea34767d8 | ||
|
e9af838231 | ||
|
ae0cad47c4 | ||
|
5fbc8ef998 | ||
|
32c6dd9e6a | ||
|
6ece6a6cfb | ||
|
39cd1c18f8 | ||
|
eb65daaa88 | ||
|
0bebdb6e33 | ||
|
1e50e392c6 | ||
|
fb673de370 | ||
|
eee73ab16c | ||
|
72384ad6ca |
@@ -43,12 +43,11 @@ subprojects.forEach {it ->
|
|||||||
}
|
}
|
||||||
|
|
||||||
ext {
|
ext {
|
||||||
jvmVersion=23
|
jvmVersion = 24
|
||||||
dockerImageBase='container-registry.oracle.com/graalvm/jdk:23'
|
dockerImageBase='container-registry.oracle.com/graalvm/jdk:24'
|
||||||
dockerImageTag='latest'
|
dockerImageTag='latest'
|
||||||
dockerImageRegistry='marginalia'
|
dockerImageRegistry='marginalia'
|
||||||
jibVersion = '3.4.4'
|
jibVersion = '3.4.4'
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
idea {
|
idea {
|
||||||
|
@@ -24,58 +24,4 @@ public class LanguageModels {
|
|||||||
this.fasttextLanguageModel = fasttextLanguageModel;
|
this.fasttextLanguageModel = fasttextLanguageModel;
|
||||||
this.segments = segments;
|
this.segments = segments;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static LanguageModelsBuilder builder() {
|
|
||||||
return new LanguageModelsBuilder();
|
|
||||||
}
|
|
||||||
|
|
||||||
public static class LanguageModelsBuilder {
|
|
||||||
private Path termFrequencies;
|
|
||||||
private Path openNLPSentenceDetectionData;
|
|
||||||
private Path posRules;
|
|
||||||
private Path posDict;
|
|
||||||
private Path fasttextLanguageModel;
|
|
||||||
private Path segments;
|
|
||||||
|
|
||||||
LanguageModelsBuilder() {
|
|
||||||
}
|
|
||||||
|
|
||||||
public LanguageModelsBuilder termFrequencies(Path termFrequencies) {
|
|
||||||
this.termFrequencies = termFrequencies;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public LanguageModelsBuilder openNLPSentenceDetectionData(Path openNLPSentenceDetectionData) {
|
|
||||||
this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public LanguageModelsBuilder posRules(Path posRules) {
|
|
||||||
this.posRules = posRules;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public LanguageModelsBuilder posDict(Path posDict) {
|
|
||||||
this.posDict = posDict;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public LanguageModelsBuilder fasttextLanguageModel(Path fasttextLanguageModel) {
|
|
||||||
this.fasttextLanguageModel = fasttextLanguageModel;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public LanguageModelsBuilder segments(Path segments) {
|
|
||||||
this.segments = segments;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public LanguageModels build() {
|
|
||||||
return new LanguageModels(this.termFrequencies, this.openNLPSentenceDetectionData, this.posRules, this.posDict, this.fasttextLanguageModel, this.segments);
|
|
||||||
}
|
|
||||||
|
|
||||||
public String toString() {
|
|
||||||
return "LanguageModels.LanguageModelsBuilder(termFrequencies=" + this.termFrequencies + ", openNLPSentenceDetectionData=" + this.openNLPSentenceDetectionData + ", posRules=" + this.posRules + ", posDict=" + this.posDict + ", fasttextLanguageModel=" + this.fasttextLanguageModel + ", segments=" + this.segments + ")";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@@ -22,6 +22,7 @@ public class DbDomainQueries {
|
|||||||
private static final Logger logger = LoggerFactory.getLogger(DbDomainQueries.class);
|
private static final Logger logger = LoggerFactory.getLogger(DbDomainQueries.class);
|
||||||
|
|
||||||
private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||||
|
private final Cache<EdgeDomain, DomainIdWithNode> domainWithNodeCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||||
private final Cache<Integer, EdgeDomain> domainNameCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
private final Cache<Integer, EdgeDomain> domainNameCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||||
private final Cache<String, List<DomainWithNode>> siblingsCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
private final Cache<String, List<DomainWithNode>> siblingsCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||||
|
|
||||||
@@ -59,6 +60,34 @@ public class DbDomainQueries {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public DomainIdWithNode getDomainIdWithNode(EdgeDomain domain) throws NoSuchElementException {
|
||||||
|
try {
|
||||||
|
return domainWithNodeCache.get(domain, () -> {
|
||||||
|
try (var connection = dataSource.getConnection();
|
||||||
|
var stmt = connection.prepareStatement("SELECT ID, NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||||
|
|
||||||
|
stmt.setString(1, domain.toString());
|
||||||
|
var rsp = stmt.executeQuery();
|
||||||
|
if (rsp.next()) {
|
||||||
|
return new DomainIdWithNode(rsp.getInt(1), rsp.getInt(2));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
throw new RuntimeException(ex);
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new NoSuchElementException();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
catch (UncheckedExecutionException ex) {
|
||||||
|
throw new NoSuchElementException();
|
||||||
|
}
|
||||||
|
catch (ExecutionException ex) {
|
||||||
|
throw new RuntimeException(ex.getCause());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public OptionalInt tryGetDomainId(EdgeDomain domain) {
|
public OptionalInt tryGetDomainId(EdgeDomain domain) {
|
||||||
|
|
||||||
Integer maybeId = domainIdCache.getIfPresent(domain);
|
Integer maybeId = domainIdCache.getIfPresent(domain);
|
||||||
@@ -145,4 +174,6 @@ public class DbDomainQueries {
|
|||||||
return nodeAffinity > 0;
|
return nodeAffinity > 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public record DomainIdWithNode (int domainId, int nodeAffinity) { }
|
||||||
}
|
}
|
||||||
|
@@ -14,7 +14,7 @@ public class EdgeDomain implements Serializable {
|
|||||||
@Nonnull
|
@Nonnull
|
||||||
public final String topDomain;
|
public final String topDomain;
|
||||||
|
|
||||||
public EdgeDomain(String host) {
|
public EdgeDomain(@Nonnull String host) {
|
||||||
Objects.requireNonNull(host, "domain name must not be null");
|
Objects.requireNonNull(host, "domain name must not be null");
|
||||||
|
|
||||||
host = host.toLowerCase();
|
host = host.toLowerCase();
|
||||||
@@ -61,6 +61,10 @@ public class EdgeDomain implements Serializable {
|
|||||||
this.topDomain = topDomain;
|
this.topDomain = topDomain;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static String getTopDomain(String host) {
|
||||||
|
return new EdgeDomain(host).topDomain;
|
||||||
|
}
|
||||||
|
|
||||||
private boolean looksLikeGovTld(String host) {
|
private boolean looksLikeGovTld(String host) {
|
||||||
if (host.length() < 8)
|
if (host.length() < 8)
|
||||||
return false;
|
return false;
|
||||||
@@ -116,24 +120,6 @@ public class EdgeDomain implements Serializable {
|
|||||||
return topDomain.substring(0, cutPoint).toLowerCase();
|
return topDomain.substring(0, cutPoint).toLowerCase();
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getLongDomainKey() {
|
|
||||||
StringBuilder ret = new StringBuilder();
|
|
||||||
|
|
||||||
int cutPoint = topDomain.indexOf('.');
|
|
||||||
if (cutPoint < 0) {
|
|
||||||
ret.append(topDomain);
|
|
||||||
} else {
|
|
||||||
ret.append(topDomain, 0, cutPoint);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!subDomain.isEmpty() && !"www".equals(subDomain)) {
|
|
||||||
ret.append(":");
|
|
||||||
ret.append(subDomain);
|
|
||||||
}
|
|
||||||
|
|
||||||
return ret.toString().toLowerCase();
|
|
||||||
}
|
|
||||||
|
|
||||||
/** If possible, try to provide an alias domain,
|
/** If possible, try to provide an alias domain,
|
||||||
* i.e. a domain name that is very likely to link to this one
|
* i.e. a domain name that is very likely to link to this one
|
||||||
* */
|
* */
|
||||||
|
@@ -10,7 +10,9 @@ import java.nio.charset.StandardCharsets;
|
|||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.time.LocalDateTime;
|
import java.time.LocalDateTime;
|
||||||
import java.util.*;
|
import java.util.HashSet;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.Set;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
|
|
||||||
/** WorkLog is a journal of work done by a process,
|
/** WorkLog is a journal of work done by a process,
|
||||||
@@ -61,6 +63,12 @@ public class WorkLog implements AutoCloseable, Closeable {
|
|||||||
return new WorkLoadIterable<>(logFile, mapper);
|
return new WorkLoadIterable<>(logFile, mapper);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static int countEntries(Path crawlerLog) throws IOException{
|
||||||
|
try (var linesStream = Files.lines(crawlerLog)) {
|
||||||
|
return (int) linesStream.filter(WorkLogEntry::isJobId).count();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Use synchro over concurrent set to avoid competing writes
|
// Use synchro over concurrent set to avoid competing writes
|
||||||
// - correct is better than fast here, it's sketchy enough to use
|
// - correct is better than fast here, it's sketchy enough to use
|
||||||
// a PrintWriter
|
// a PrintWriter
|
||||||
|
@@ -6,6 +6,7 @@ import nu.marginalia.service.ServiceId;
|
|||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.net.InetAddress;
|
import java.net.InetAddress;
|
||||||
import java.net.NetworkInterface;
|
import java.net.NetworkInterface;
|
||||||
import java.util.Enumeration;
|
import java.util.Enumeration;
|
||||||
@@ -115,11 +116,12 @@ public class ServiceConfigurationModule extends AbstractModule {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String getLocalNetworkIP() throws Exception {
|
public static String getLocalNetworkIP() throws IOException {
|
||||||
Enumeration<NetworkInterface> nets = NetworkInterface.getNetworkInterfaces();
|
Enumeration<NetworkInterface> nets = NetworkInterface.getNetworkInterfaces();
|
||||||
|
|
||||||
while (nets.hasMoreElements()) {
|
while (nets.hasMoreElements()) {
|
||||||
NetworkInterface netif = nets.nextElement();
|
NetworkInterface netif = nets.nextElement();
|
||||||
|
logger.info("Considering network interface {}: Up? {}, Loopback? {}", netif.getDisplayName(), netif.isUp(), netif.isLoopback());
|
||||||
if (!netif.isUp() || netif.isLoopback()) {
|
if (!netif.isUp() || netif.isLoopback()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -127,6 +129,7 @@ public class ServiceConfigurationModule extends AbstractModule {
|
|||||||
Enumeration<InetAddress> inetAddresses = netif.getInetAddresses();
|
Enumeration<InetAddress> inetAddresses = netif.getInetAddresses();
|
||||||
while (inetAddresses.hasMoreElements()) {
|
while (inetAddresses.hasMoreElements()) {
|
||||||
InetAddress addr = inetAddresses.nextElement();
|
InetAddress addr = inetAddresses.nextElement();
|
||||||
|
logger.info("Considering address {}: SiteLocal? {}, Loopback? {}", addr.getHostAddress(), addr.isSiteLocalAddress(), addr.isLoopbackAddress());
|
||||||
if (addr.isSiteLocalAddress() && !addr.isLoopbackAddress()) {
|
if (addr.isSiteLocalAddress() && !addr.isLoopbackAddress()) {
|
||||||
return addr.getHostAddress();
|
return addr.getHostAddress();
|
||||||
}
|
}
|
||||||
|
@@ -15,6 +15,7 @@ import org.slf4j.LoggerFactory;
|
|||||||
import org.slf4j.Marker;
|
import org.slf4j.Marker;
|
||||||
import org.slf4j.MarkerFactory;
|
import org.slf4j.MarkerFactory;
|
||||||
|
|
||||||
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@@ -106,9 +107,12 @@ public class JoobyService {
|
|||||||
config.externalAddress());
|
config.externalAddress());
|
||||||
|
|
||||||
// FIXME: This won't work outside of docker, may need to submit a PR to jooby to allow classpaths here
|
// FIXME: This won't work outside of docker, may need to submit a PR to jooby to allow classpaths here
|
||||||
|
if (Files.exists(Path.of("/app/resources/jte")) || Files.exists(Path.of("/app/classes/jte-precompiled"))) {
|
||||||
jooby.install(new JteModule(Path.of("/app/resources/jte"), Path.of("/app/classes/jte-precompiled")));
|
jooby.install(new JteModule(Path.of("/app/resources/jte"), Path.of("/app/classes/jte-precompiled")));
|
||||||
|
}
|
||||||
|
if (Files.exists(Path.of("/app/resources/static"))) {
|
||||||
jooby.assets("/*", Paths.get("/app/resources/static"));
|
jooby.assets("/*", Paths.get("/app/resources/static"));
|
||||||
|
}
|
||||||
var options = new ServerOptions();
|
var options = new ServerOptions();
|
||||||
options.setHost(config.bindAddress());
|
options.setHost(config.bindAddress());
|
||||||
options.setPort(restEndpoint.port());
|
options.setPort(restEndpoint.port());
|
||||||
|
@@ -6,17 +6,22 @@ import nu.marginalia.service.module.ServiceConfiguration;
|
|||||||
import org.eclipse.jetty.server.Server;
|
import org.eclipse.jetty.server.Server;
|
||||||
import org.eclipse.jetty.servlet.ServletContextHandler;
|
import org.eclipse.jetty.servlet.ServletContextHandler;
|
||||||
import org.eclipse.jetty.servlet.ServletHolder;
|
import org.eclipse.jetty.servlet.ServletHolder;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.net.InetSocketAddress;
|
import java.net.InetSocketAddress;
|
||||||
|
|
||||||
public class MetricsServer {
|
public class MetricsServer {
|
||||||
|
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(MetricsServer.class);
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public MetricsServer(ServiceConfiguration configuration) throws Exception {
|
public MetricsServer(ServiceConfiguration configuration) {
|
||||||
// If less than zero, we forego setting up a metrics server
|
// If less than zero, we forego setting up a metrics server
|
||||||
if (configuration.metricsPort() < 0)
|
if (configuration.metricsPort() < 0)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
try {
|
||||||
Server server = new Server(new InetSocketAddress(configuration.bindAddress(), configuration.metricsPort()));
|
Server server = new Server(new InetSocketAddress(configuration.bindAddress(), configuration.metricsPort()));
|
||||||
|
|
||||||
ServletContextHandler context = new ServletContextHandler();
|
ServletContextHandler context = new ServletContextHandler();
|
||||||
@@ -25,6 +30,12 @@ public class MetricsServer {
|
|||||||
|
|
||||||
context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
|
context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
|
||||||
|
|
||||||
|
logger.info("MetricsServer listening on {}:{}", configuration.bindAddress(), configuration.metricsPort());
|
||||||
|
|
||||||
server.start();
|
server.start();
|
||||||
}
|
}
|
||||||
|
catch (Exception|NoSuchMethodError ex) {
|
||||||
|
logger.error("Failed to set up metrics server", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@@ -35,21 +35,8 @@ public class RateLimiter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static RateLimiter forExpensiveRequest() {
|
|
||||||
return new RateLimiter(5, 10);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static RateLimiter custom(int perMinute) {
|
public static RateLimiter custom(int perMinute) {
|
||||||
return new RateLimiter(perMinute, 60);
|
return new RateLimiter(4 * perMinute, perMinute);
|
||||||
}
|
|
||||||
|
|
||||||
public static RateLimiter forSpamBots() {
|
|
||||||
return new RateLimiter(120, 3600);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public static RateLimiter forLogin() {
|
|
||||||
return new RateLimiter(3, 15);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void cleanIdleBuckets() {
|
private void cleanIdleBuckets() {
|
||||||
@@ -62,7 +49,7 @@ public class RateLimiter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private Bucket createBucket() {
|
private Bucket createBucket() {
|
||||||
var refill = Refill.greedy(1, Duration.ofSeconds(refillRate));
|
var refill = Refill.greedy(refillRate, Duration.ofSeconds(60));
|
||||||
var bw = Bandwidth.classic(capacity, refill);
|
var bw = Bandwidth.classic(capacity, refill);
|
||||||
return Bucket.builder().addLimit(bw).build();
|
return Bucket.builder().addLimit(bw).build();
|
||||||
}
|
}
|
||||||
|
@@ -25,7 +25,7 @@ import static org.mockito.Mockito.when;
|
|||||||
class ZkServiceRegistryTest {
|
class ZkServiceRegistryTest {
|
||||||
private static final int ZOOKEEPER_PORT = 2181;
|
private static final int ZOOKEEPER_PORT = 2181;
|
||||||
private static final GenericContainer<?> zookeeper =
|
private static final GenericContainer<?> zookeeper =
|
||||||
new GenericContainer<>("zookeeper:3.8.0")
|
new GenericContainer<>("zookeeper:3.8")
|
||||||
.withExposedPorts(ZOOKEEPER_PORT);
|
.withExposedPorts(ZOOKEEPER_PORT);
|
||||||
|
|
||||||
List<ZkServiceRegistry> registries = new ArrayList<>();
|
List<ZkServiceRegistry> registries = new ArrayList<>();
|
||||||
|
@@ -14,6 +14,8 @@ import nu.marginalia.mq.persistence.MqPersistence;
|
|||||||
import nu.marginalia.nodecfg.NodeConfigurationService;
|
import nu.marginalia.nodecfg.NodeConfigurationService;
|
||||||
import nu.marginalia.nodecfg.model.NodeProfile;
|
import nu.marginalia.nodecfg.model.NodeProfile;
|
||||||
import nu.marginalia.service.module.ServiceConfiguration;
|
import nu.marginalia.service.module.ServiceConfiguration;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
import java.time.LocalDateTime;
|
import java.time.LocalDateTime;
|
||||||
@@ -29,6 +31,7 @@ public class UpdateRssActor extends RecordActorPrototype {
|
|||||||
|
|
||||||
private final NodeConfigurationService nodeConfigurationService;
|
private final NodeConfigurationService nodeConfigurationService;
|
||||||
private final MqPersistence persistence;
|
private final MqPersistence persistence;
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(UpdateRssActor.class);
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public UpdateRssActor(Gson gson,
|
public UpdateRssActor(Gson gson,
|
||||||
@@ -101,8 +104,8 @@ public class UpdateRssActor extends RecordActorPrototype {
|
|||||||
case UpdateRefresh(int count, long msgId) -> {
|
case UpdateRefresh(int count, long msgId) -> {
|
||||||
MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
|
MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
|
||||||
if (msg == null) {
|
if (msg == null) {
|
||||||
// Retry the update
|
logger.warn("UpdateRefresh is taking a very long time");
|
||||||
yield new Error("Failed to update feeds: message not found");
|
yield new UpdateRefresh(count, msgId);
|
||||||
} else if (msg.state() != MqMessageState.OK) {
|
} else if (msg.state() != MqMessageState.OK) {
|
||||||
// Retry the update
|
// Retry the update
|
||||||
yield new Error("Failed to update feeds: " + msg.state());
|
yield new Error("Failed to update feeds: " + msg.state());
|
||||||
@@ -119,8 +122,8 @@ public class UpdateRssActor extends RecordActorPrototype {
|
|||||||
case UpdateClean(long msgId) -> {
|
case UpdateClean(long msgId) -> {
|
||||||
MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
|
MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
|
||||||
if (msg == null) {
|
if (msg == null) {
|
||||||
// Retry the update
|
logger.warn("UpdateClean is taking a very long time");
|
||||||
yield new Error("Failed to update feeds: message not found");
|
yield new UpdateClean(msgId);
|
||||||
} else if (msg.state() != MqMessageState.OK) {
|
} else if (msg.state() != MqMessageState.OK) {
|
||||||
// Retry the update
|
// Retry the update
|
||||||
yield new Error("Failed to update feeds: " + msg.state());
|
yield new Error("Failed to update feeds: " + msg.state());
|
||||||
|
@@ -8,6 +8,7 @@ import nu.marginalia.actor.state.ActorStep;
|
|||||||
import nu.marginalia.io.CrawlerOutputFile;
|
import nu.marginalia.io.CrawlerOutputFile;
|
||||||
import nu.marginalia.process.log.WorkLog;
|
import nu.marginalia.process.log.WorkLog;
|
||||||
import nu.marginalia.process.log.WorkLogEntry;
|
import nu.marginalia.process.log.WorkLogEntry;
|
||||||
|
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||||
import nu.marginalia.slop.SlopCrawlDataRecord;
|
import nu.marginalia.slop.SlopCrawlDataRecord;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.storage.model.FileStorage;
|
import nu.marginalia.storage.model.FileStorage;
|
||||||
@@ -18,6 +19,7 @@ import org.slf4j.LoggerFactory;
|
|||||||
|
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.StandardCopyOption;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
@@ -26,14 +28,15 @@ import java.util.function.Function;
|
|||||||
public class MigrateCrawlDataActor extends RecordActorPrototype {
|
public class MigrateCrawlDataActor extends RecordActorPrototype {
|
||||||
|
|
||||||
private final FileStorageService fileStorageService;
|
private final FileStorageService fileStorageService;
|
||||||
|
private final ServiceHeartbeat serviceHeartbeat;
|
||||||
private static final Logger logger = LoggerFactory.getLogger(MigrateCrawlDataActor.class);
|
private static final Logger logger = LoggerFactory.getLogger(MigrateCrawlDataActor.class);
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public MigrateCrawlDataActor(Gson gson, FileStorageService fileStorageService) {
|
public MigrateCrawlDataActor(Gson gson, FileStorageService fileStorageService, ServiceHeartbeat serviceHeartbeat) {
|
||||||
super(gson);
|
super(gson);
|
||||||
|
|
||||||
this.fileStorageService = fileStorageService;
|
this.fileStorageService = fileStorageService;
|
||||||
|
this.serviceHeartbeat = serviceHeartbeat;
|
||||||
}
|
}
|
||||||
|
|
||||||
public record Run(long fileStorageId) implements ActorStep {}
|
public record Run(long fileStorageId) implements ActorStep {}
|
||||||
@@ -49,33 +52,50 @@ public class MigrateCrawlDataActor extends RecordActorPrototype {
|
|||||||
Path crawlerLog = root.resolve("crawler.log");
|
Path crawlerLog = root.resolve("crawler.log");
|
||||||
Path newCrawlerLog = Files.createTempFile(root, "crawler", ".migrate.log");
|
Path newCrawlerLog = Files.createTempFile(root, "crawler", ".migrate.log");
|
||||||
|
|
||||||
try (WorkLog workLog = new WorkLog(newCrawlerLog)) {
|
int totalEntries = WorkLog.countEntries(crawlerLog);
|
||||||
|
|
||||||
|
try (WorkLog workLog = new WorkLog(newCrawlerLog);
|
||||||
|
var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Migrating")
|
||||||
|
) {
|
||||||
|
int entryIdx = 0;
|
||||||
|
|
||||||
for (Map.Entry<WorkLogEntry, Path> item : WorkLog.iterableMap(crawlerLog, new CrawlDataLocator(root))) {
|
for (Map.Entry<WorkLogEntry, Path> item : WorkLog.iterableMap(crawlerLog, new CrawlDataLocator(root))) {
|
||||||
|
|
||||||
var entry = item.getKey();
|
final WorkLogEntry entry = item.getKey();
|
||||||
var path = item.getValue();
|
final Path inputPath = item.getValue();
|
||||||
|
|
||||||
logger.info("Converting {}", entry.id());
|
Path outputPath = inputPath;
|
||||||
|
heartbeat.progress("Migrating" + inputPath.getFileName(), entryIdx++, totalEntries);
|
||||||
|
|
||||||
|
if (inputPath.toString().endsWith(".parquet")) {
|
||||||
if (path.toFile().getName().endsWith(".parquet")) {
|
|
||||||
String domain = entry.id();
|
String domain = entry.id();
|
||||||
String id = Integer.toHexString(domain.hashCode());
|
String id = Integer.toHexString(domain.hashCode());
|
||||||
|
|
||||||
Path outputFile = CrawlerOutputFile.createSlopPath(root, id, domain);
|
outputPath = CrawlerOutputFile.createSlopPath(root, id, domain);
|
||||||
|
|
||||||
SlopCrawlDataRecord.convertFromParquet(path, outputFile);
|
if (Files.exists(inputPath)) {
|
||||||
|
try {
|
||||||
|
SlopCrawlDataRecord.convertFromParquet(inputPath, outputPath);
|
||||||
|
Files.deleteIfExists(inputPath);
|
||||||
|
} catch (Exception ex) {
|
||||||
|
outputPath = inputPath; // don't update the work log on error
|
||||||
|
logger.error("Failed to convert " + inputPath, ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (!Files.exists(inputPath) && !Files.exists(outputPath)) {
|
||||||
|
// if the input file is missing, and the output file is missing, we just write the log
|
||||||
|
// record identical to the old one
|
||||||
|
outputPath = inputPath;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
workLog.setJobToFinished(entry.id(), outputFile.toString(), entry.cnt());
|
// Write a log entry for the (possibly) converted file
|
||||||
}
|
workLog.setJobToFinished(entry.id(), outputPath.toString(), entry.cnt());
|
||||||
else {
|
|
||||||
workLog.setJobToFinished(entry.id(), path.toString(), entry.cnt());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Path oldCrawlerLog = Files.createTempFile(root, "crawler-", ".migrate.old.log");
|
Path oldCrawlerLog = Files.createTempFile(root, "crawler-", ".migrate.old.log");
|
||||||
Files.move(crawlerLog, oldCrawlerLog);
|
Files.move(crawlerLog, oldCrawlerLog, StandardCopyOption.REPLACE_EXISTING);
|
||||||
Files.move(newCrawlerLog, crawlerLog);
|
Files.move(newCrawlerLog, crawlerLog);
|
||||||
|
|
||||||
yield new End();
|
yield new End();
|
||||||
|
47
code/functions/favicon/api/build.gradle
Normal file
47
code/functions/favicon/api/build.gradle
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
plugins {
|
||||||
|
id 'java'
|
||||||
|
|
||||||
|
id "com.google.protobuf" version "0.9.4"
|
||||||
|
id 'jvm-test-suite'
|
||||||
|
}
|
||||||
|
|
||||||
|
java {
|
||||||
|
toolchain {
|
||||||
|
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
jar.archiveBaseName = 'favicon-api'
|
||||||
|
|
||||||
|
apply from: "$rootProject.projectDir/protobuf.gradle"
|
||||||
|
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||||
|
|
||||||
|
dependencies {
|
||||||
|
implementation project(':code:common:model')
|
||||||
|
implementation project(':code:common:config')
|
||||||
|
implementation project(':code:common:service')
|
||||||
|
|
||||||
|
implementation libs.bundles.slf4j
|
||||||
|
|
||||||
|
implementation libs.prometheus
|
||||||
|
implementation libs.notnull
|
||||||
|
implementation libs.guava
|
||||||
|
implementation dependencies.create(libs.guice.get()) {
|
||||||
|
exclude group: 'com.google.guava'
|
||||||
|
}
|
||||||
|
implementation libs.gson
|
||||||
|
implementation libs.bundles.protobuf
|
||||||
|
implementation libs.guava
|
||||||
|
libs.bundles.grpc.get().each {
|
||||||
|
implementation dependencies.create(it) {
|
||||||
|
exclude group: 'com.google.guava'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
testImplementation libs.bundles.slf4j.test
|
||||||
|
testImplementation libs.bundles.junit
|
||||||
|
testImplementation libs.mockito
|
||||||
|
|
||||||
|
}
|
@@ -0,0 +1,39 @@
|
|||||||
|
package nu.marginalia.api.favicon;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||||
|
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
|
||||||
|
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||||
|
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
public class FaviconClient {
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(FaviconClient.class);
|
||||||
|
|
||||||
|
private final GrpcMultiNodeChannelPool<FaviconAPIGrpc.FaviconAPIBlockingStub> channelPool;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public FaviconClient(GrpcChannelPoolFactory factory) {
|
||||||
|
this.channelPool = factory.createMulti(
|
||||||
|
ServiceKey.forGrpcApi(FaviconAPIGrpc.class, ServicePartition.multi()),
|
||||||
|
FaviconAPIGrpc::newBlockingStub);
|
||||||
|
}
|
||||||
|
|
||||||
|
public record FaviconData(byte[] bytes, String contentType) {}
|
||||||
|
|
||||||
|
|
||||||
|
public Optional<FaviconData> getFavicon(String domain, int node) {
|
||||||
|
RpcFaviconResponse rsp = channelPool.call(FaviconAPIGrpc.FaviconAPIBlockingStub::getFavicon)
|
||||||
|
.forNode(node)
|
||||||
|
.run(RpcFaviconRequest.newBuilder().setDomain(domain).build());
|
||||||
|
|
||||||
|
if (rsp.getData().isEmpty())
|
||||||
|
return Optional.empty();
|
||||||
|
|
||||||
|
return Optional.of(new FaviconData(rsp.getData().toByteArray(), rsp.getContentType()));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
20
code/functions/favicon/api/src/main/protobuf/favicon.proto
Normal file
20
code/functions/favicon/api/src/main/protobuf/favicon.proto
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
syntax="proto3";
|
||||||
|
package marginalia.api.favicon;
|
||||||
|
|
||||||
|
option java_package="nu.marginalia.api.favicon";
|
||||||
|
option java_multiple_files=true;
|
||||||
|
|
||||||
|
service FaviconAPI {
|
||||||
|
/** Fetches information about a domain. */
|
||||||
|
rpc getFavicon(RpcFaviconRequest) returns (RpcFaviconResponse) {}
|
||||||
|
}
|
||||||
|
|
||||||
|
message RpcFaviconRequest {
|
||||||
|
string domain = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
message RpcFaviconResponse {
|
||||||
|
string domain = 1;
|
||||||
|
bytes data = 2;
|
||||||
|
string contentType = 3;
|
||||||
|
}
|
49
code/functions/favicon/build.gradle
Normal file
49
code/functions/favicon/build.gradle
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
// Build script for the favicon function module.
plugins {
    id 'java'

    id 'application'
    id 'jvm-test-suite'
}

java {
    toolchain {
        // The JVM version is defined once in the root project
        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
    }
}

apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {
    implementation project(':code:common:config')
    implementation project(':code:common:service')
    implementation project(':code:common:model')
    implementation project(':code:common:db')
    implementation project(':code:functions:favicon:api')
    implementation project(':code:processes:crawling-process')

    implementation libs.bundles.slf4j

    implementation libs.prometheus
    // Guava is declared once here; it is excluded from the gRPC and Guice dependencies below
    implementation libs.guava
    libs.bundles.grpc.get().each {
        implementation dependencies.create(it) {
            exclude group: 'com.google.guava'
        }
    }

    implementation libs.notnull
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
    implementation dependencies.create(libs.spark.get()) {
        exclude group: 'org.eclipse.jetty'
    }

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}
|
@@ -0,0 +1,48 @@
|
|||||||
|
package nu.marginalia.functions.favicon;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import com.google.protobuf.ByteString;
|
||||||
|
import io.grpc.stub.StreamObserver;
|
||||||
|
import nu.marginalia.api.favicon.FaviconAPIGrpc;
|
||||||
|
import nu.marginalia.api.favicon.RpcFaviconRequest;
|
||||||
|
import nu.marginalia.api.favicon.RpcFaviconResponse;
|
||||||
|
import nu.marginalia.crawl.DomainStateDb;
|
||||||
|
import nu.marginalia.service.server.DiscoverableService;
|
||||||
|
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class FaviconGrpcService extends FaviconAPIGrpc.FaviconAPIImplBase implements DiscoverableService {
|
||||||
|
private final DomainStateDb domainStateDb;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public FaviconGrpcService(DomainStateDb domainStateDb) {
|
||||||
|
this.domainStateDb = domainStateDb;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean shouldRegisterService() {
|
||||||
|
return domainStateDb.isAvailable();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void getFavicon(RpcFaviconRequest request, StreamObserver<RpcFaviconResponse> responseObserver) {
|
||||||
|
Optional<DomainStateDb.FaviconRecord> icon = domainStateDb.getIcon(request.getDomain());
|
||||||
|
|
||||||
|
RpcFaviconResponse response;
|
||||||
|
if (icon.isEmpty()) {
|
||||||
|
response = RpcFaviconResponse.newBuilder().build();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
var iconRecord = icon.get();
|
||||||
|
response = RpcFaviconResponse.newBuilder()
|
||||||
|
.setContentType(iconRecord.contentType())
|
||||||
|
.setDomain(request.getDomain())
|
||||||
|
.setData(ByteString.copyFrom(iconRecord.imageData()))
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
responseObserver.onNext(response);
|
||||||
|
responseObserver.onCompleted();
|
||||||
|
}
|
||||||
|
}
|
@@ -34,6 +34,7 @@ dependencies {
|
|||||||
implementation libs.bundles.slf4j
|
implementation libs.bundles.slf4j
|
||||||
implementation libs.commons.lang3
|
implementation libs.commons.lang3
|
||||||
implementation libs.commons.io
|
implementation libs.commons.io
|
||||||
|
implementation libs.wiremock
|
||||||
|
|
||||||
implementation libs.prometheus
|
implementation libs.prometheus
|
||||||
implementation libs.guava
|
implementation libs.guava
|
||||||
|
@@ -1,6 +1,7 @@
|
|||||||
package nu.marginalia.livecapture;
|
package nu.marginalia.livecapture;
|
||||||
|
|
||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
|
import nu.marginalia.WmsaHome;
|
||||||
import nu.marginalia.model.gson.GsonFactory;
|
import nu.marginalia.model.gson.GsonFactory;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
@@ -12,6 +13,7 @@ import java.net.http.HttpRequest;
|
|||||||
import java.net.http.HttpResponse;
|
import java.net.http.HttpResponse;
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
/** Client for local browserless.io API */
|
/** Client for local browserless.io API */
|
||||||
public class BrowserlessClient implements AutoCloseable {
|
public class BrowserlessClient implements AutoCloseable {
|
||||||
@@ -27,13 +29,16 @@ public class BrowserlessClient implements AutoCloseable {
|
|||||||
private final URI browserlessURI;
|
private final URI browserlessURI;
|
||||||
private final Gson gson = GsonFactory.get();
|
private final Gson gson = GsonFactory.get();
|
||||||
|
|
||||||
|
private final String userAgent = WmsaHome.getUserAgent().uaString();
|
||||||
|
|
||||||
public BrowserlessClient(URI browserlessURI) {
|
public BrowserlessClient(URI browserlessURI) {
|
||||||
this.browserlessURI = browserlessURI;
|
this.browserlessURI = browserlessURI;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
|
public Optional<String> content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
|
||||||
Map<String, Object> requestData = Map.of(
|
Map<String, Object> requestData = Map.of(
|
||||||
"url", url,
|
"url", url,
|
||||||
|
"userAgent", userAgent,
|
||||||
"gotoOptions", gotoOptions
|
"gotoOptions", gotoOptions
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -49,10 +54,10 @@ public class BrowserlessClient implements AutoCloseable {
|
|||||||
|
|
||||||
if (rsp.statusCode() >= 300) {
|
if (rsp.statusCode() >= 300) {
|
||||||
logger.info("Failed to fetch content for {}, status {}", url, rsp.statusCode());
|
logger.info("Failed to fetch content for {}, status {}", url, rsp.statusCode());
|
||||||
return null;
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
return rsp.body();
|
return Optional.of(rsp.body());
|
||||||
}
|
}
|
||||||
|
|
||||||
public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions)
|
public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions)
|
||||||
@@ -60,6 +65,7 @@ public class BrowserlessClient implements AutoCloseable {
|
|||||||
|
|
||||||
Map<String, Object> requestData = Map.of(
|
Map<String, Object> requestData = Map.of(
|
||||||
"url", url,
|
"url", url,
|
||||||
|
"userAgent", userAgent,
|
||||||
"options", screenshotOptions,
|
"options", screenshotOptions,
|
||||||
"gotoOptions", gotoOptions
|
"gotoOptions", gotoOptions
|
||||||
);
|
);
|
||||||
@@ -84,7 +90,7 @@ public class BrowserlessClient implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void close() throws Exception {
|
public void close() {
|
||||||
httpClient.shutdownNow();
|
httpClient.shutdownNow();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -1,5 +1,9 @@
|
|||||||
package nu.marginalia.livecapture;
|
package nu.marginalia.livecapture;
|
||||||
|
|
||||||
|
import com.github.tomakehurst.wiremock.WireMockServer;
|
||||||
|
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
|
||||||
|
import nu.marginalia.WmsaHome;
|
||||||
|
import nu.marginalia.service.module.ServiceConfigurationModule;
|
||||||
import org.junit.jupiter.api.Assertions;
|
import org.junit.jupiter.api.Assertions;
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
import org.junit.jupiter.api.Tag;
|
import org.junit.jupiter.api.Tag;
|
||||||
@@ -8,34 +12,86 @@ import org.testcontainers.containers.GenericContainer;
|
|||||||
import org.testcontainers.junit.jupiter.Testcontainers;
|
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||||
import org.testcontainers.utility.DockerImageName;
|
import org.testcontainers.utility.DockerImageName;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
import static com.github.tomakehurst.wiremock.client.WireMock.*;
|
||||||
|
|
||||||
|
|
||||||
@Testcontainers
|
@Testcontainers
|
||||||
@Tag("slow")
|
@Tag("slow")
|
||||||
public class BrowserlessClientTest {
|
public class BrowserlessClientTest {
|
||||||
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
|
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
|
||||||
.withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
|
.withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
|
||||||
|
.withNetworkMode("bridge")
|
||||||
.withExposedPorts(3000);
|
.withExposedPorts(3000);
|
||||||
|
|
||||||
|
static WireMockServer wireMockServer =
|
||||||
|
new WireMockServer(WireMockConfiguration.wireMockConfig()
|
||||||
|
.port(18089));
|
||||||
|
|
||||||
|
static String localIp;
|
||||||
|
|
||||||
|
static URI browserlessURI;
|
||||||
|
|
||||||
@BeforeAll
|
@BeforeAll
|
||||||
public static void setup() {
|
public static void setup() throws IOException {
|
||||||
container.start();
|
container.start();
|
||||||
|
|
||||||
|
browserlessURI = URI.create(String.format("http://%s:%d/",
|
||||||
|
container.getHost(),
|
||||||
|
container.getMappedPort(3000))
|
||||||
|
);
|
||||||
|
|
||||||
|
wireMockServer.start();
|
||||||
|
wireMockServer.stubFor(get("/").willReturn(aResponse().withStatus(200).withBody("Ok")));
|
||||||
|
|
||||||
|
localIp = ServiceConfigurationModule.getLocalNetworkIP();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Tag("flaky")
|
||||||
|
@Test
|
||||||
|
public void testInspectContentUA__Flaky() throws Exception {
|
||||||
|
try (var client = new BrowserlessClient(browserlessURI)) {
|
||||||
|
client.content("http://" + localIp + ":18089/",
|
||||||
|
BrowserlessClient.GotoOptions.defaultValues()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
wireMockServer.verify(getRequestedFor(urlEqualTo("/")).withHeader("User-Agent", equalTo(WmsaHome.getUserAgent().uaString())));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Tag("flaky")
|
||||||
|
@Test
|
||||||
|
public void testInspectScreenshotUA__Flaky() throws Exception {
|
||||||
|
try (var client = new BrowserlessClient(browserlessURI)) {
|
||||||
|
client.screenshot("http://" + localIp + ":18089/",
|
||||||
|
BrowserlessClient.GotoOptions.defaultValues(),
|
||||||
|
BrowserlessClient.ScreenshotOptions.defaultValues()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
wireMockServer.verify(getRequestedFor(urlEqualTo("/")).withHeader("User-Agent", equalTo(WmsaHome.getUserAgent().uaString())));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testContent() throws Exception {
|
public void testContent() throws Exception {
|
||||||
try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) {
|
try (var client = new BrowserlessClient(browserlessURI)) {
|
||||||
var content = client.content("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues());
|
var content = client.content("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()).orElseThrow();
|
||||||
Assertions.assertNotNull(content, "Content should not be null");
|
|
||||||
Assertions.assertFalse(content.isBlank(), "Content should not be empty");
|
Assertions.assertFalse(content.isBlank(), "Content should not be empty");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testScreenshot() throws Exception {
|
public void testScreenshot() throws Exception {
|
||||||
try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) {
|
try (var client = new BrowserlessClient(browserlessURI)) {
|
||||||
var screenshot = client.screenshot("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues(), BrowserlessClient.ScreenshotOptions.defaultValues());
|
var screenshot = client.screenshot("https://www.marginalia.nu/",
|
||||||
|
BrowserlessClient.GotoOptions.defaultValues(),
|
||||||
|
BrowserlessClient.ScreenshotOptions.defaultValues());
|
||||||
|
|
||||||
Assertions.assertNotNull(screenshot, "Screenshot should not be null");
|
Assertions.assertNotNull(screenshot, "Screenshot should not be null");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -134,6 +134,10 @@ public class QueryExpansion {
|
|||||||
if (scoreCombo > scoreA + scoreB || scoreCombo > 1000) {
|
if (scoreCombo > scoreA + scoreB || scoreCombo > 1000) {
|
||||||
graph.addVariantForSpan(prev, qw, joinedWord);
|
graph.addVariantForSpan(prev, qw, joinedWord);
|
||||||
}
|
}
|
||||||
|
else if (StringUtils.isAlpha(prev.word()) && StringUtils.isNumeric(qw.word())) { // join e.g. trs 80 to trs80 and trs-80
|
||||||
|
graph.addVariantForSpan(prev, qw, prev.word() + qw.word());
|
||||||
|
graph.addVariantForSpan(prev, qw, prev.word() + "-" + qw.word());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
prev = qw;
|
prev = qw;
|
||||||
|
@@ -213,6 +213,18 @@ public class QueryFactoryTest {
|
|||||||
System.out.println(subquery);
|
System.out.println(subquery);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testContractionWordNum() {
|
||||||
|
var subquery = parseAndGetSpecs("glove 80");
|
||||||
|
|
||||||
|
Assertions.assertTrue(subquery.query.compiledQuery.contains(" glove "));
|
||||||
|
Assertions.assertTrue(subquery.query.compiledQuery.contains(" 80 "));
|
||||||
|
Assertions.assertTrue(subquery.query.compiledQuery.contains(" glove-80 "));
|
||||||
|
Assertions.assertTrue(subquery.query.compiledQuery.contains(" glove80 "));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testCplusPlus() {
|
public void testCplusPlus() {
|
||||||
var subquery = parseAndGetSpecs("std::vector::push_back vector");
|
var subquery = parseAndGetSpecs("std::vector::push_back vector");
|
||||||
|
@@ -23,16 +23,33 @@ public class SimpleBlockingThreadPool {
|
|||||||
private final Logger logger = LoggerFactory.getLogger(SimpleBlockingThreadPool.class);
|
private final Logger logger = LoggerFactory.getLogger(SimpleBlockingThreadPool.class);
|
||||||
|
|
||||||
public SimpleBlockingThreadPool(String name, int poolSize, int queueSize) {
|
public SimpleBlockingThreadPool(String name, int poolSize, int queueSize) {
|
||||||
|
this(name, poolSize, queueSize, ThreadType.PLATFORM);
|
||||||
|
}
|
||||||
|
|
||||||
|
public SimpleBlockingThreadPool(String name, int poolSize, int queueSize, ThreadType threadType) {
|
||||||
tasks = new ArrayBlockingQueue<>(queueSize);
|
tasks = new ArrayBlockingQueue<>(queueSize);
|
||||||
|
|
||||||
for (int i = 0; i < poolSize; i++) {
|
for (int i = 0; i < poolSize; i++) {
|
||||||
Thread worker = new Thread(this::worker, name + "[" + i + "]");
|
|
||||||
worker.setDaemon(true);
|
Thread.Builder threadBuilder = switch (threadType) {
|
||||||
worker.start();
|
case VIRTUAL -> Thread.ofVirtual();
|
||||||
|
case PLATFORM -> Thread.ofPlatform().daemon(true);
|
||||||
|
};
|
||||||
|
|
||||||
|
Thread worker = threadBuilder
|
||||||
|
.name(name + "[" + i + "]")
|
||||||
|
.start(this::worker);
|
||||||
|
|
||||||
workers.add(worker);
|
workers.add(worker);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public enum ThreadType {
|
||||||
|
VIRTUAL,
|
||||||
|
PLATFORM
|
||||||
|
}
|
||||||
|
|
||||||
public void submit(Task task) throws InterruptedException {
|
public void submit(Task task) throws InterruptedException {
|
||||||
tasks.put(task);
|
tasks.put(task);
|
||||||
}
|
}
|
||||||
|
@@ -5,9 +5,7 @@ import nu.marginalia.actor.state.*;
|
|||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.*;
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
public abstract class RecordActorPrototype implements ActorPrototype {
|
public abstract class RecordActorPrototype implements ActorPrototype {
|
||||||
|
|
||||||
@@ -118,7 +116,7 @@ public abstract class RecordActorPrototype implements ActorPrototype {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private String functionName(Class<? extends ActorStep> functionClass) {
|
private String functionName(Class<? extends ActorStep> functionClass) {
|
||||||
return functionClass.getSimpleName().toUpperCase();
|
return ActorStep.functionName(functionClass);
|
||||||
}
|
}
|
||||||
|
|
||||||
private ActorStep constructState(String message) throws ReflectiveOperationException {
|
private ActorStep constructState(String message) throws ReflectiveOperationException {
|
||||||
@@ -145,4 +143,43 @@ public abstract class RecordActorPrototype implements ActorPrototype {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Get a list of JSON prototypes for each actor step declared by this actor */
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
public Map<String, String> getMessagePrototypes() {
|
||||||
|
Map<String, String> messagePrototypes = new HashMap<>();
|
||||||
|
|
||||||
|
for (var clazz : getClass().getDeclaredClasses()) {
|
||||||
|
if (!clazz.isRecord() || !ActorStep.class.isAssignableFrom(clazz))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
StringJoiner sj = new StringJoiner(",\n\t", "{\n\t", "\n}");
|
||||||
|
|
||||||
|
renderToJsonPrototype(sj, (Class<? extends Record>) clazz);
|
||||||
|
|
||||||
|
messagePrototypes.put(ActorStep.functionName((Class<? extends ActorStep>) clazz), sj.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
return messagePrototypes;
|
||||||
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
private void renderToJsonPrototype(StringJoiner sj, Class<? extends Record> recordType) {
|
||||||
|
for (var field : recordType.getDeclaredFields()) {
|
||||||
|
String typeName = field.getType().getSimpleName();
|
||||||
|
|
||||||
|
if ("List".equals(typeName)) {
|
||||||
|
sj.add(String.format("\"%s\": [ ]", field.getName()));
|
||||||
|
}
|
||||||
|
else if (field.getType().isRecord()) {
|
||||||
|
var innerSj = new StringJoiner(",", "{", "}");
|
||||||
|
renderToJsonPrototype(innerSj, (Class<? extends Record>) field.getType());
|
||||||
|
sj.add(String.format("\"%s\": %s", field.getName(), sj));
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
sj.add(String.format("\"%s\": \"%s\"", field.getName(), typeName));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@@ -1,3 +1,7 @@
|
|||||||
package nu.marginalia.actor.state;
|
package nu.marginalia.actor.state;
|
||||||
|
|
||||||
public interface ActorStep {}
|
/** Type implemented by the classes that represent the steps of an actor. */
public interface ActorStep {
    /**
     * Derives the function name of a step from its class: the simple class name in upper case.
     * The conversion uses {@code Locale.ROOT} so the name does not depend on the JVM's default
     * locale (e.g. a Turkish locale would otherwise map 'i' to a dotted capital I).
     *
     * @param type the step class
     * @return the upper-cased simple name of {@code type}
     */
    static String functionName(Class<? extends ActorStep> type) {
        return type.getSimpleName().toUpperCase(java.util.Locale.ROOT);
    }
}
|
||||||
|
@@ -35,6 +35,7 @@ import java.io.IOException;
|
|||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
@@ -202,13 +203,19 @@ public class ConverterMain extends ProcessMainClass {
|
|||||||
heartbeat.setProgress(processedDomains.get() / (double) totalDomains);
|
heartbeat.setProgress(processedDomains.get() / (double) totalDomains);
|
||||||
|
|
||||||
logger.info("Processing small items");
|
logger.info("Processing small items");
|
||||||
int numBigTasks = 0;
|
|
||||||
|
// We separate the large and small domains to reduce the number of critical sections,
|
||||||
|
// as the large domains have a separate processing track that doesn't store everything
|
||||||
|
// in memory
|
||||||
|
|
||||||
|
final List<Path> bigTasks = new ArrayList<>();
|
||||||
|
|
||||||
// First process the small items
|
// First process the small items
|
||||||
for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(),
|
for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(),
|
||||||
new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog)))
|
new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog)))
|
||||||
{
|
{
|
||||||
if (SerializableCrawlDataStream.getSizeHint(dataPath) >= SIDELOAD_THRESHOLD) {
|
if (SerializableCrawlDataStream.getSizeHint(dataPath) >= SIDELOAD_THRESHOLD) {
|
||||||
numBigTasks ++;
|
bigTasks.add(dataPath);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -239,15 +246,8 @@ public class ConverterMain extends ProcessMainClass {
|
|||||||
try (var hb = heartbeat.createAdHocTaskHeartbeat("Large Domains")) {
|
try (var hb = heartbeat.createAdHocTaskHeartbeat("Large Domains")) {
|
||||||
int bigTaskIdx = 0;
|
int bigTaskIdx = 0;
|
||||||
// Next the big items domain-by-domain
|
// Next the big items domain-by-domain
|
||||||
for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(),
|
for (var dataPath : bigTasks) {
|
||||||
new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog)))
|
hb.progress(dataPath.toFile().getName(), bigTaskIdx++, bigTasks.size());
|
||||||
{
|
|
||||||
int sizeHint = SerializableCrawlDataStream.getSizeHint(dataPath);
|
|
||||||
if (sizeHint < SIDELOAD_THRESHOLD) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
hb.progress(dataPath.toFile().getName(), bigTaskIdx++, numBigTasks);
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// SerializableCrawlDataStream is autocloseable, we can't try-with-resources because then it will be
|
// SerializableCrawlDataStream is autocloseable, we can't try-with-resources because then it will be
|
||||||
@@ -255,7 +255,7 @@ public class ConverterMain extends ProcessMainClass {
|
|||||||
// will close it after it's consumed.
|
// will close it after it's consumed.
|
||||||
|
|
||||||
var stream = SerializableCrawlDataStream.openDataStream(dataPath);
|
var stream = SerializableCrawlDataStream.openDataStream(dataPath);
|
||||||
ConverterBatchWritableIf writable = processor.simpleProcessing(stream, sizeHint);
|
ConverterBatchWritableIf writable = processor.simpleProcessing(stream, SerializableCrawlDataStream.getSizeHint(dataPath));
|
||||||
|
|
||||||
converterWriter.accept(writable);
|
converterWriter.accept(writable);
|
||||||
}
|
}
|
||||||
|
@@ -116,7 +116,7 @@ public class AdblockSimulator {
|
|||||||
|
|
||||||
|
|
||||||
// Refrain from cleaning up this code, it's very hot code and needs to be fast.
|
// Refrain from cleaning up this code, it's very hot code and needs to be fast.
|
||||||
// This version is about 100x faster than the a "clean" first stab implementation.
|
// This version is about 100x faster than a "clean" first stab implementation.
|
||||||
|
|
||||||
class RuleVisitor implements NodeFilter {
|
class RuleVisitor implements NodeFilter {
|
||||||
public boolean sawAds;
|
public boolean sawAds;
|
||||||
|
@@ -23,7 +23,7 @@ public class DocumentGeneratorExtractor {
|
|||||||
|
|
||||||
var tags = doc.select("meta[name=generator]");
|
var tags = doc.select("meta[name=generator]");
|
||||||
|
|
||||||
if (tags.size() == 0) {
|
if (tags.isEmpty()) {
|
||||||
// Some sites have a comment in the head instead of a meta tag
|
// Some sites have a comment in the head instead of a meta tag
|
||||||
return fingerprintServerTech(doc, responseHeaders);
|
return fingerprintServerTech(doc, responseHeaders);
|
||||||
}
|
}
|
||||||
|
@@ -127,7 +127,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
|||||||
}
|
}
|
||||||
fullHtml.append("</div></body></html>");
|
fullHtml.append("</div></body></html>");
|
||||||
|
|
||||||
var doc = sideloaderProcessing
|
return sideloaderProcessing
|
||||||
.processDocument(fullUrl,
|
.processDocument(fullUrl,
|
||||||
fullHtml.toString(),
|
fullHtml.toString(),
|
||||||
List.of("encyclopedia", "wiki"),
|
List.of("encyclopedia", "wiki"),
|
||||||
@@ -137,8 +137,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
|||||||
anchorTextKeywords.getAnchorTextKeywords(domainLinks, new EdgeUrl(fullUrl)),
|
anchorTextKeywords.getAnchorTextKeywords(domainLinks, new EdgeUrl(fullUrl)),
|
||||||
LocalDate.now().getYear(),
|
LocalDate.now().getYear(),
|
||||||
10_000_000);
|
10_000_000);
|
||||||
|
|
||||||
return doc;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private String normalizeUtf8(String url) {
|
private String normalizeUtf8(String url) {
|
||||||
|
@@ -11,7 +11,6 @@ import nu.marginalia.slop.column.primitive.IntColumn;
|
|||||||
import nu.marginalia.slop.column.primitive.LongColumn;
|
import nu.marginalia.slop.column.primitive.LongColumn;
|
||||||
import nu.marginalia.slop.column.string.EnumColumn;
|
import nu.marginalia.slop.column.string.EnumColumn;
|
||||||
import nu.marginalia.slop.column.string.StringColumn;
|
import nu.marginalia.slop.column.string.StringColumn;
|
||||||
import nu.marginalia.slop.column.string.TxtStringColumn;
|
|
||||||
import nu.marginalia.slop.desc.StorageType;
|
import nu.marginalia.slop.desc.StorageType;
|
||||||
import org.jetbrains.annotations.Nullable;
|
import org.jetbrains.annotations.Nullable;
|
||||||
|
|
||||||
@@ -182,8 +181,8 @@ public record SlopDocumentRecord(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Basic information
|
// Basic information
|
||||||
private static final TxtStringColumn domainsColumn = new TxtStringColumn("domain", StandardCharsets.UTF_8, StorageType.GZIP);
|
private static final StringColumn domainsColumn = new StringColumn("domain", StandardCharsets.UTF_8, StorageType.GZIP);
|
||||||
private static final TxtStringColumn urlsColumn = new TxtStringColumn("url", StandardCharsets.UTF_8, StorageType.GZIP);
|
private static final StringColumn urlsColumn = new StringColumn("url", StandardCharsets.UTF_8, StorageType.GZIP);
|
||||||
private static final VarintColumn ordinalsColumn = new VarintColumn("ordinal", StorageType.PLAIN);
|
private static final VarintColumn ordinalsColumn = new VarintColumn("ordinal", StorageType.PLAIN);
|
||||||
private static final EnumColumn statesColumn = new EnumColumn("state", StandardCharsets.US_ASCII, StorageType.PLAIN);
|
private static final EnumColumn statesColumn = new EnumColumn("state", StandardCharsets.US_ASCII, StorageType.PLAIN);
|
||||||
private static final StringColumn stateReasonsColumn = new StringColumn("stateReason", StandardCharsets.US_ASCII, StorageType.GZIP);
|
private static final StringColumn stateReasonsColumn = new StringColumn("stateReason", StandardCharsets.US_ASCII, StorageType.GZIP);
|
||||||
@@ -211,7 +210,7 @@ public record SlopDocumentRecord(
|
|||||||
private static final VarintCodedSequenceArrayColumn spansColumn = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD);
|
private static final VarintCodedSequenceArrayColumn spansColumn = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD);
|
||||||
|
|
||||||
public static class KeywordsProjectionReader extends SlopTable {
|
public static class KeywordsProjectionReader extends SlopTable {
|
||||||
private final TxtStringColumn.Reader domainsReader;
|
private final StringColumn.Reader domainsReader;
|
||||||
private final VarintColumn.Reader ordinalsReader;
|
private final VarintColumn.Reader ordinalsReader;
|
||||||
private final IntColumn.Reader htmlFeaturesReader;
|
private final IntColumn.Reader htmlFeaturesReader;
|
||||||
private final LongColumn.Reader domainMetadataReader;
|
private final LongColumn.Reader domainMetadataReader;
|
||||||
@@ -275,8 +274,8 @@ public record SlopDocumentRecord(
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static class MetadataReader extends SlopTable {
|
public static class MetadataReader extends SlopTable {
|
||||||
private final TxtStringColumn.Reader domainsReader;
|
private final StringColumn.Reader domainsReader;
|
||||||
private final TxtStringColumn.Reader urlsReader;
|
private final StringColumn.Reader urlsReader;
|
||||||
private final VarintColumn.Reader ordinalsReader;
|
private final VarintColumn.Reader ordinalsReader;
|
||||||
private final StringColumn.Reader titlesReader;
|
private final StringColumn.Reader titlesReader;
|
||||||
private final StringColumn.Reader descriptionsReader;
|
private final StringColumn.Reader descriptionsReader;
|
||||||
@@ -332,8 +331,8 @@ public record SlopDocumentRecord(
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static class Writer extends SlopTable {
|
public static class Writer extends SlopTable {
|
||||||
private final TxtStringColumn.Writer domainsWriter;
|
private final StringColumn.Writer domainsWriter;
|
||||||
private final TxtStringColumn.Writer urlsWriter;
|
private final StringColumn.Writer urlsWriter;
|
||||||
private final VarintColumn.Writer ordinalsWriter;
|
private final VarintColumn.Writer ordinalsWriter;
|
||||||
private final EnumColumn.Writer statesWriter;
|
private final EnumColumn.Writer statesWriter;
|
||||||
private final StringColumn.Writer stateReasonsWriter;
|
private final StringColumn.Writer stateReasonsWriter;
|
||||||
|
@@ -41,10 +41,7 @@ import java.nio.file.Files;
|
|||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.nio.file.StandardCopyOption;
|
import java.nio.file.StandardCopyOption;
|
||||||
import java.security.Security;
|
import java.security.Security;
|
||||||
import java.util.ArrayList;
|
import java.util.*;
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
@@ -106,9 +103,18 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
this.blacklist = blacklist;
|
this.blacklist = blacklist;
|
||||||
this.node = processConfiguration.node();
|
this.node = processConfiguration.node();
|
||||||
|
|
||||||
|
SimpleBlockingThreadPool.ThreadType threadType;
|
||||||
|
if (Boolean.getBoolean("crawler.useVirtualThreads")) {
|
||||||
|
threadType = SimpleBlockingThreadPool.ThreadType.VIRTUAL;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
threadType = SimpleBlockingThreadPool.ThreadType.PLATFORM;
|
||||||
|
}
|
||||||
|
|
||||||
pool = new SimpleBlockingThreadPool("CrawlerPool",
|
pool = new SimpleBlockingThreadPool("CrawlerPool",
|
||||||
Integer.getInteger("crawler.poolSize", 256),
|
Integer.getInteger("crawler.poolSize", 256),
|
||||||
1);
|
1,
|
||||||
|
threadType);
|
||||||
|
|
||||||
|
|
||||||
// Wait for the blacklist to be loaded before starting the crawl
|
// Wait for the blacklist to be loaded before starting the crawl
|
||||||
@@ -224,10 +230,7 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
|
|
||||||
logger.info("Loaded {} domains", crawlSpecRecords.size());
|
logger.info("Loaded {} domains", crawlSpecRecords.size());
|
||||||
|
|
||||||
// Shuffle the domains to ensure we get a good mix of domains in each crawl,
|
crawlSpecRecords.sort(crawlSpecArrangement(crawlSpecRecords));
|
||||||
// so that e.g. the big domains don't get all crawled at once, or we end up
|
|
||||||
// crawling the same server in parallel from different subdomains...
|
|
||||||
Collections.shuffle(crawlSpecRecords);
|
|
||||||
|
|
||||||
// First a validation run to ensure the file is all good to parse
|
// First a validation run to ensure the file is all good to parse
|
||||||
if (crawlSpecRecords.isEmpty()) {
|
if (crawlSpecRecords.isEmpty()) {
|
||||||
@@ -248,9 +251,14 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
// (this happens when the process is restarted after a crash or a shutdown)
|
// (this happens when the process is restarted after a crash or a shutdown)
|
||||||
tasksDone.set(workLog.countFinishedJobs());
|
tasksDone.set(workLog.countFinishedJobs());
|
||||||
|
|
||||||
// Create crawl tasks and submit them to the pool for execution
|
// List of deferred tasks used to ensure beneficial scheduling of domains with regard to DomainLocks,
|
||||||
|
// merely shuffling the domains tends to lead to a lot of threads being blocked waiting for a semaphore,
|
||||||
|
// this will more aggressively attempt to schedule the jobs to avoid blocking
|
||||||
|
List<CrawlTask> taskList = new ArrayList<>();
|
||||||
|
|
||||||
|
// Create crawl tasks
|
||||||
for (CrawlSpecRecord crawlSpec : crawlSpecRecords) {
|
for (CrawlSpecRecord crawlSpec : crawlSpecRecords) {
|
||||||
if (workLog.isJobFinished(crawlSpec.domain()))
|
if (workLog.isJobFinished(crawlSpec.domain))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
var task = new CrawlTask(
|
var task = new CrawlTask(
|
||||||
@@ -261,11 +269,22 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
domainStateDb,
|
domainStateDb,
|
||||||
workLog);
|
workLog);
|
||||||
|
|
||||||
if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) == null) {
|
// Try to run immediately, to avoid unnecessarily keeping the entire work set in RAM
|
||||||
pool.submitQuietly(task);
|
if (!trySubmitDeferredTask(task)) {
|
||||||
|
// Otherwise add to the taskList for deferred execution
|
||||||
|
taskList.add(task);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Schedule viable tasks for execution until list is empty
|
||||||
|
while (!taskList.isEmpty()) {
|
||||||
|
taskList.removeIf(this::trySubmitDeferredTask);
|
||||||
|
|
||||||
|
// Add a small pause here to avoid busy looping toward the end of the execution cycle when
|
||||||
|
// we might have no new viable tasks to run for hours on end
|
||||||
|
TimeUnit.MILLISECONDS.sleep(50);
|
||||||
|
}
|
||||||
|
|
||||||
logger.info("Shutting down the pool, waiting for tasks to complete...");
|
logger.info("Shutting down the pool, waiting for tasks to complete...");
|
||||||
|
|
||||||
pool.shutDown();
|
pool.shutDown();
|
||||||
@@ -290,6 +309,52 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Create a comparator that sorts the crawl specs in a way that is beneficial for the crawl,
|
||||||
|
* we want to enqueue domains that have common top domains first, but otherwise have a random
|
||||||
|
* order.
|
||||||
|
* <p></p>
|
||||||
|
* Note, we can't use hash codes for randomization as it is not desirable to have the same order
|
||||||
|
* every time the process is restarted (and CrawlSpecRecord is a record, which defines equals and
|
||||||
|
* hashcode based on the fields).
|
||||||
|
* */
|
||||||
|
private Comparator<CrawlSpecRecord> crawlSpecArrangement(List<CrawlSpecRecord> records) {
|
||||||
|
Random r = new Random();
|
||||||
|
Map<String, Integer> topDomainCounts = new HashMap<>(4 + (int) Math.sqrt(records.size()));
|
||||||
|
Map<String, Integer> randomOrder = new HashMap<>(records.size());
|
||||||
|
|
||||||
|
for (var spec : records) {
|
||||||
|
topDomainCounts.merge(EdgeDomain.getTopDomain(spec.domain), 1, Integer::sum);
|
||||||
|
randomOrder.put(spec.domain, r.nextInt());
|
||||||
|
}
|
||||||
|
|
||||||
|
return Comparator.comparing((CrawlSpecRecord spec) -> topDomainCounts.getOrDefault(EdgeDomain.getTopDomain(spec.domain), 0) >= 8)
|
||||||
|
.reversed()
|
||||||
|
.thenComparing(spec -> randomOrder.get(spec.domain))
|
||||||
|
.thenComparing(Record::hashCode); // non-deterministic tie-breaker to
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Submit a task for execution if it can be run, returns true if it was submitted
|
||||||
|
* or if it can be discarded */
|
||||||
|
private boolean trySubmitDeferredTask(CrawlTask task) {
|
||||||
|
if (!task.canRun()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null) {
|
||||||
|
return true; // task has already run, duplicate in crawl specs
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
// This blocks the caller when the pool is full
|
||||||
|
pool.submitQuietly(task);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
catch (RuntimeException ex) {
|
||||||
|
logger.error("Failed to submit task " + task.domain, ex);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public void runForSingleDomain(String targetDomainName, FileStorageId fileStorageId) throws Exception {
|
public void runForSingleDomain(String targetDomainName, FileStorageId fileStorageId) throws Exception {
|
||||||
runForSingleDomain(targetDomainName, fileStorageService.getStorage(fileStorageId).asPath());
|
runForSingleDomain(targetDomainName, fileStorageService.getStorage(fileStorageId).asPath());
|
||||||
}
|
}
|
||||||
@@ -346,9 +411,20 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
this.id = Integer.toHexString(domain.hashCode());
|
this.id = Integer.toHexString(domain.hashCode());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Best effort indicator whether we could start this now without getting stuck in
|
||||||
|
* DomainLocks purgatory */
|
||||||
|
public boolean canRun() {
|
||||||
|
return domainLocks.canLock(new EdgeDomain(domain));
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void run() throws Exception {
|
public void run() throws Exception {
|
||||||
|
|
||||||
|
if (workLog.isJobFinished(domain)) { // No-Op
|
||||||
|
logger.info("Omitting task {}, as it is already run", domain);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
|
Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
|
||||||
Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
|
Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
|
||||||
Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
|
Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
|
||||||
@@ -425,7 +501,7 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
return new CrawlDataReference(slopPath);
|
return new CrawlDataReference(slopPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
} catch (IOException e) {
|
} catch (Exception e) {
|
||||||
logger.debug("Failed to read previous crawl data for {}", specification.domain());
|
logger.debug("Failed to read previous crawl data for {}", specification.domain());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -494,7 +570,7 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
//
|
//
|
||||||
// This must be synchronized as chewing through parquet files in parallel leads to enormous memory overhead
|
// This must be synchronized as chewing through parquet files in parallel leads to enormous memory overhead
|
||||||
private synchronized Path migrateParquetData(Path inputPath, String domain, Path crawlDataRoot) throws IOException {
|
private synchronized Path migrateParquetData(Path inputPath, String domain, Path crawlDataRoot) throws IOException {
|
||||||
if (!inputPath.endsWith(".parquet")) {
|
if (!inputPath.toString().endsWith(".parquet")) {
|
||||||
return inputPath;
|
return inputPath;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -1,5 +1,8 @@
|
|||||||
package nu.marginalia.crawl;
|
package nu.marginalia.crawl;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import nu.marginalia.storage.FileStorageService;
|
||||||
|
import nu.marginalia.storage.model.FileStorageType;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@@ -8,6 +11,7 @@ import java.nio.file.Path;
|
|||||||
import java.sql.Connection;
|
import java.sql.Connection;
|
||||||
import java.sql.DriverManager;
|
import java.sql.DriverManager;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
|
import java.time.Duration;
|
||||||
import java.time.Instant;
|
import java.time.Instant;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
@@ -21,6 +25,17 @@ public class DomainStateDb implements AutoCloseable {
|
|||||||
|
|
||||||
private final Connection connection;
|
private final Connection connection;
|
||||||
|
|
||||||
|
|
||||||
|
/** Per-domain crawl metadata, as stored in the {@code crawl_meta} table.
 *
 * @param domainName     domain the row belongs to; the table's primary key
 * @param lastFullCrawl  instant of the last full crawl; persisted as epoch milliseconds
 * @param recrawlTime    persisted in milliseconds; presumably the time spent re-crawling
 *                       known documents (TODO confirm against the writer)
 * @param crawlTime      persisted in milliseconds; presumably the time spent crawling
 *                       new documents (TODO confirm against the writer)
 * @param recrawlErrors  integer counter; meaning is defined by the code that fills it in
 * @param crawlChanges   integer counter; meaning is defined by the code that fills it in
 * @param totalCrawlSize integer size measure; unit is defined by the code that fills it in
 */
public record CrawlMeta(
        String domainName,
        Instant lastFullCrawl,
        Duration recrawlTime,
        Duration crawlTime,
        int recrawlErrors,
        int crawlChanges,
        int totalCrawlSize
) {}
|
||||||
|
|
||||||
public record SummaryRecord(
|
public record SummaryRecord(
|
||||||
String domainName,
|
String domainName,
|
||||||
Instant lastUpdated,
|
Instant lastUpdated,
|
||||||
@@ -63,7 +78,29 @@ public class DomainStateDb implements AutoCloseable {
|
|||||||
|
|
||||||
public record FaviconRecord(String contentType, byte[] imageData) {}
|
public record FaviconRecord(String contentType, byte[] imageData) {}
|
||||||
|
|
||||||
public DomainStateDb(Path filename) throws SQLException {
|
/** Injection constructor: opens the database at the path chosen by
 * {@link #findFilename(FileStorageService)}, which may be {@code null}
 * when there is no active crawl data storage.
 *
 * @param fileStorageService used to look up the active crawl data storage
 * @throws SQLException propagated from the storage lookup or from opening the database
 */
@Inject
public DomainStateDb(FileStorageService fileStorageService) throws SQLException {
    this(findFilename(fileStorageService));
}

/** Locate {@code domainstate.db} inside the only active CRAWL_DATA file storage.
 *
 * @param fileStorageService used to look up the active crawl data storage
 * @return the database path, or {@code null} if no such storage is active
 * @throws SQLException propagated from the file storage service
 */
private static Path findFilename(FileStorageService fileStorageService) throws SQLException {
    var activeStorageId = fileStorageService.getOnlyActiveFileStorage(FileStorageType.CRAWL_DATA);

    // No active crawl data storage: the caller receives null
    if (!activeStorageId.isPresent()) {
        return null;
    }

    var storage = fileStorageService.getStorage(activeStorageId.get());
    return storage.asPath().resolve("domainstate.db");
}
|
||||||
|
|
||||||
|
public DomainStateDb(@Nullable Path filename) throws SQLException {
|
||||||
|
if (null == filename) {
|
||||||
|
connection = null;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
String sqliteDbString = "jdbc:sqlite:" + filename.toString();
|
String sqliteDbString = "jdbc:sqlite:" + filename.toString();
|
||||||
connection = DriverManager.getConnection(sqliteDbString);
|
connection = DriverManager.getConnection(sqliteDbString);
|
||||||
|
|
||||||
@@ -77,6 +114,17 @@ public class DomainStateDb implements AutoCloseable {
|
|||||||
feedUrl TEXT
|
feedUrl TEXT
|
||||||
)
|
)
|
||||||
""");
|
""");
|
||||||
|
stmt.executeUpdate("""
|
||||||
|
CREATE TABLE IF NOT EXISTS crawl_meta (
|
||||||
|
domain TEXT PRIMARY KEY,
|
||||||
|
lastFullCrawlEpochMs LONG NOT NULL,
|
||||||
|
recrawlTimeMs LONG NOT NULL,
|
||||||
|
recrawlErrors INTEGER NOT NULL,
|
||||||
|
crawlTimeMs LONG NOT NULL,
|
||||||
|
crawlChanges INTEGER NOT NULL,
|
||||||
|
totalCrawlSize INTEGER NOT NULL
|
||||||
|
)
|
||||||
|
""");
|
||||||
stmt.executeUpdate("""
|
stmt.executeUpdate("""
|
||||||
CREATE TABLE IF NOT EXISTS favicon (
|
CREATE TABLE IF NOT EXISTS favicon (
|
||||||
domain TEXT PRIMARY KEY,
|
domain TEXT PRIMARY KEY,
|
||||||
@@ -90,11 +138,18 @@ public class DomainStateDb implements AutoCloseable {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void close() throws SQLException {
|
public void close() throws SQLException {
|
||||||
|
if (connection != null) {
|
||||||
connection.close();
|
connection.close();
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isAvailable() {
|
||||||
|
return connection != null;
|
||||||
|
}
|
||||||
|
|
||||||
public void saveIcon(String domain, FaviconRecord faviconRecord) {
|
public void saveIcon(String domain, FaviconRecord faviconRecord) {
|
||||||
|
if (connection == null) throw new IllegalStateException("No connection to domainstate db");
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("""
|
try (var stmt = connection.prepareStatement("""
|
||||||
INSERT OR REPLACE INTO favicon (domain, contentType, icon)
|
INSERT OR REPLACE INTO favicon (domain, contentType, icon)
|
||||||
VALUES(?, ?, ?)
|
VALUES(?, ?, ?)
|
||||||
@@ -110,6 +165,9 @@ public class DomainStateDb implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public Optional<FaviconRecord> getIcon(String domain) {
|
public Optional<FaviconRecord> getIcon(String domain) {
|
||||||
|
if (connection == null)
|
||||||
|
return Optional.empty();
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("SELECT contentType, icon FROM favicon WHERE DOMAIN = ?")) {
|
try (var stmt = connection.prepareStatement("SELECT contentType, icon FROM favicon WHERE DOMAIN = ?")) {
|
||||||
stmt.setString(1, domain);
|
stmt.setString(1, domain);
|
||||||
var rs = stmt.executeQuery();
|
var rs = stmt.executeQuery();
|
||||||
@@ -129,7 +187,29 @@ public class DomainStateDb implements AutoCloseable {
|
|||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void save(CrawlMeta crawlMeta) {
|
||||||
|
if (connection == null) throw new IllegalStateException("No connection to domainstate db");
|
||||||
|
|
||||||
|
try (var stmt = connection.prepareStatement("""
|
||||||
|
INSERT OR REPLACE INTO crawl_meta (domain, lastFullCrawlEpochMs, recrawlTimeMs, recrawlErrors, crawlTimeMs, crawlChanges, totalCrawlSize)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||||
|
""")) {
|
||||||
|
stmt.setString(1, crawlMeta.domainName());
|
||||||
|
stmt.setLong(2, crawlMeta.lastFullCrawl.toEpochMilli());
|
||||||
|
stmt.setLong(3, crawlMeta.recrawlTime.toMillis());
|
||||||
|
stmt.setInt(4, crawlMeta.recrawlErrors);
|
||||||
|
stmt.setLong(5, crawlMeta.crawlTime.toMillis());
|
||||||
|
stmt.setInt(6, crawlMeta.crawlChanges);
|
||||||
|
stmt.setInt(7, crawlMeta.totalCrawlSize);
|
||||||
|
stmt.executeUpdate();
|
||||||
|
} catch (SQLException e) {
|
||||||
|
logger.error("Failed to insert crawl meta record", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public void save(SummaryRecord record) {
|
public void save(SummaryRecord record) {
|
||||||
|
if (connection == null) throw new IllegalStateException("No connection to domainstate db");
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("""
|
try (var stmt = connection.prepareStatement("""
|
||||||
INSERT OR REPLACE INTO summary (domain, lastUpdatedEpochMs, state, stateDesc, feedUrl)
|
INSERT OR REPLACE INTO summary (domain, lastUpdatedEpochMs, state, stateDesc, feedUrl)
|
||||||
VALUES (?, ?, ?, ?, ?)
|
VALUES (?, ?, ?, ?, ?)
|
||||||
@@ -145,7 +225,38 @@ public class DomainStateDb implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public Optional<SummaryRecord> get(String domainName) {
|
public Optional<CrawlMeta> getMeta(String domainName) {
|
||||||
|
if (connection == null)
|
||||||
|
return Optional.empty();
|
||||||
|
|
||||||
|
try (var stmt = connection.prepareStatement("""
|
||||||
|
SELECT domain, lastFullCrawlEpochMs, recrawlTimeMs, recrawlErrors, crawlTimeMs, crawlChanges, totalCrawlSize
|
||||||
|
FROM crawl_meta
|
||||||
|
WHERE domain = ?
|
||||||
|
""")) {
|
||||||
|
stmt.setString(1, domainName);
|
||||||
|
var rs = stmt.executeQuery();
|
||||||
|
if (rs.next()) {
|
||||||
|
return Optional.of(new CrawlMeta(
|
||||||
|
rs.getString("domain"),
|
||||||
|
Instant.ofEpochMilli(rs.getLong("lastFullCrawlEpochMs")),
|
||||||
|
Duration.ofMillis(rs.getLong("recrawlTimeMs")),
|
||||||
|
Duration.ofMillis(rs.getLong("crawlTimeMs")),
|
||||||
|
rs.getInt("recrawlErrors"),
|
||||||
|
rs.getInt("crawlChanges"),
|
||||||
|
rs.getInt("totalCrawlSize")
|
||||||
|
));
|
||||||
|
}
|
||||||
|
} catch (SQLException ex) {
|
||||||
|
logger.error("Failed to get crawl meta record", ex);
|
||||||
|
}
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
public Optional<SummaryRecord> getSummary(String domainName) {
|
||||||
|
if (connection == null)
|
||||||
|
return Optional.empty();
|
||||||
|
|
||||||
try (var stmt = connection.prepareStatement("""
|
try (var stmt = connection.prepareStatement("""
|
||||||
SELECT domain, lastUpdatedEpochMs, state, stateDesc, feedUrl
|
SELECT domain, lastUpdatedEpochMs, state, stateDesc, feedUrl
|
||||||
FROM summary
|
FROM summary
|
||||||
|
@@ -29,7 +29,9 @@ import java.net.http.HttpResponse;
|
|||||||
import java.net.http.HttpTimeoutException;
|
import java.net.http.HttpTimeoutException;
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
import java.util.concurrent.Executors;
|
import java.util.concurrent.Executors;
|
||||||
|
import java.util.concurrent.Semaphore;
|
||||||
import java.util.zip.GZIPInputStream;
|
import java.util.zip.GZIPInputStream;
|
||||||
|
|
||||||
|
|
||||||
@@ -45,6 +47,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
|
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
|
||||||
|
|
||||||
private final Duration requestTimeout = Duration.ofSeconds(10);
|
private final Duration requestTimeout = Duration.ofSeconds(10);
|
||||||
|
private final Duration probeTimeout = Duration.ofSeconds(30);
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
|
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
|
||||||
@@ -54,12 +57,22 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
private final HttpClient client;
|
private final HttpClient client;
|
||||||
|
|
||||||
private HttpClient createClient() {
|
private HttpClient createClient() {
|
||||||
|
final ExecutorService executorService;
|
||||||
|
|
||||||
|
if (Boolean.getBoolean("crawler.httpclient.useVirtualThreads")) {
|
||||||
|
executorService = Executors.newVirtualThreadPerTaskExecutor();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
executorService = Executors.newCachedThreadPool();
|
||||||
|
}
|
||||||
|
|
||||||
return HttpClient.newBuilder()
|
return HttpClient.newBuilder()
|
||||||
.sslContext(NoSecuritySSL.buildSslContext())
|
.sslContext(NoSecuritySSL.buildSslContext())
|
||||||
.cookieHandler(cookies)
|
.cookieHandler(cookies)
|
||||||
.followRedirects(HttpClient.Redirect.NORMAL)
|
.followRedirects(HttpClient.Redirect.NORMAL)
|
||||||
|
.version(HttpClient.Version.HTTP_1_1)
|
||||||
.connectTimeout(Duration.ofSeconds(8))
|
.connectTimeout(Duration.ofSeconds(8))
|
||||||
.executor(Executors.newCachedThreadPool())
|
.executor(executorService)
|
||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -107,24 +120,28 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
.HEAD()
|
.HEAD()
|
||||||
.uri(url.asURI())
|
.uri(url.asURI())
|
||||||
.header("User-agent", userAgentString)
|
.header("User-agent", userAgentString)
|
||||||
.timeout(requestTimeout)
|
.timeout(probeTimeout)
|
||||||
.build();
|
.build();
|
||||||
} catch (URISyntaxException e) {
|
} catch (URISyntaxException e) {
|
||||||
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
|
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (int tries = 0;; tries++) {
|
||||||
try {
|
try {
|
||||||
var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
|
var rsp = SendLock.wrapSend(client, head, HttpResponse.BodyHandlers.discarding());
|
||||||
EdgeUrl rspUri = new EdgeUrl(rsp.uri());
|
EdgeUrl rspUri = new EdgeUrl(rsp.uri());
|
||||||
|
|
||||||
if (!Objects.equals(rspUri.domain, url.domain)) {
|
if (!Objects.equals(rspUri.domain, url.domain)) {
|
||||||
return new DomainProbeResult.Redirect(rspUri.domain);
|
return new DomainProbeResult.Redirect(rspUri.domain);
|
||||||
}
|
}
|
||||||
return new DomainProbeResult.Ok(rspUri);
|
return new DomainProbeResult.Ok(rspUri);
|
||||||
}
|
} catch (Exception ex) {
|
||||||
catch (Exception ex) {
|
if (tries > 3) {
|
||||||
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
|
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
|
||||||
}
|
}
|
||||||
|
// else try again ...
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Perform a HEAD request to fetch the content type of a URL.
|
/** Perform a HEAD request to fetch the content type of a URL.
|
||||||
@@ -143,12 +160,12 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
var headBuilder = HttpRequest.newBuilder()
|
var headBuilder = HttpRequest.newBuilder()
|
||||||
.HEAD()
|
.HEAD()
|
||||||
.uri(url.asURI())
|
.uri(url.asURI())
|
||||||
.header("User-agent", userAgentString)
|
.header("User-Agent", userAgentString)
|
||||||
.header("Accept-Encoding", "gzip")
|
.header("Accept-Encoding", "gzip")
|
||||||
.timeout(requestTimeout)
|
.timeout(requestTimeout)
|
||||||
;
|
;
|
||||||
|
|
||||||
var rsp = client.send(headBuilder.build(), HttpResponse.BodyHandlers.discarding());
|
var rsp = SendLock.wrapSend(client, headBuilder.build(), HttpResponse.BodyHandlers.discarding());
|
||||||
var headers = rsp.headers();
|
var headers = rsp.headers();
|
||||||
|
|
||||||
var contentTypeHeader = headers.firstValue("Content-Type").orElse(null);
|
var contentTypeHeader = headers.firstValue("Content-Type").orElse(null);
|
||||||
@@ -215,7 +232,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
var getBuilder = HttpRequest.newBuilder()
|
var getBuilder = HttpRequest.newBuilder()
|
||||||
.GET()
|
.GET()
|
||||||
.uri(url.asURI())
|
.uri(url.asURI())
|
||||||
.header("User-agent", userAgentString)
|
.header("User-Agent", userAgentString)
|
||||||
.header("Accept-Encoding", "gzip")
|
.header("Accept-Encoding", "gzip")
|
||||||
.header("Accept-Language", "en,*;q=0.5")
|
.header("Accept-Language", "en,*;q=0.5")
|
||||||
.header("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
|
.header("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
|
||||||
@@ -224,6 +241,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
|
|
||||||
contentTags.paint(getBuilder);
|
contentTags.paint(getBuilder);
|
||||||
|
|
||||||
|
try (var sl = new SendLock()) {
|
||||||
HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());
|
HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());
|
||||||
|
|
||||||
if (result instanceof HttpFetchResult.ResultOk ok) {
|
if (result instanceof HttpFetchResult.ResultOk ok) {
|
||||||
@@ -241,11 +259,14 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public SitemapRetriever createSitemapRetriever() {
|
public SitemapRetriever createSitemapRetriever() {
|
||||||
return new SitemapRetriever();
|
return new SitemapRetriever();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Recursively fetch sitemaps */
|
||||||
@Override
|
@Override
|
||||||
public List<EdgeUrl> fetchSitemapUrls(String root, CrawlDelayTimer delayTimer) {
|
public List<EdgeUrl> fetchSitemapUrls(String root, CrawlDelayTimer delayTimer) {
|
||||||
try {
|
try {
|
||||||
@@ -265,7 +286,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
while (!sitemapQueue.isEmpty() && ret.size() < 20_000 && ++fetchedSitemaps < 10) {
|
while (!sitemapQueue.isEmpty() && ret.size() < 20_000 && ++fetchedSitemaps < 10) {
|
||||||
var head = sitemapQueue.removeFirst();
|
var head = sitemapQueue.removeFirst();
|
||||||
|
|
||||||
switch (fetchSitemap(head)) {
|
switch (fetchSingleSitemap(head)) {
|
||||||
case SitemapResult.SitemapUrls(List<String> urls) -> {
|
case SitemapResult.SitemapUrls(List<String> urls) -> {
|
||||||
|
|
||||||
for (var url : urls) {
|
for (var url : urls) {
|
||||||
@@ -301,32 +322,38 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private SitemapResult fetchSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
|
private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
|
||||||
HttpRequest getRequest = HttpRequest.newBuilder()
|
HttpRequest getRequest = HttpRequest.newBuilder()
|
||||||
.GET()
|
.GET()
|
||||||
.uri(sitemapUrl.asURI())
|
.uri(sitemapUrl.asURI())
|
||||||
.header("Accept-Encoding", "gzip")
|
.header("Accept-Encoding", "gzip")
|
||||||
.header("Accept", "text/*, */*;q=0.9")
|
.header("Accept", "text/*, */*;q=0.9")
|
||||||
.header("User-agent", userAgentString)
|
.header("User-Agent", userAgentString)
|
||||||
.timeout(requestTimeout)
|
.timeout(requestTimeout)
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
|
try (var sl = new SendLock()) {
|
||||||
var response = client.send(getRequest, HttpResponse.BodyHandlers.ofInputStream());
|
var response = client.send(getRequest, HttpResponse.BodyHandlers.ofInputStream());
|
||||||
if (response.statusCode() != 200) {
|
if (response.statusCode() != 200) {
|
||||||
return new SitemapResult.SitemapError();
|
return new SitemapResult.SitemapError();
|
||||||
}
|
}
|
||||||
|
|
||||||
try (InputStream inputStream = response.body()) {
|
Document parsedSitemap;
|
||||||
|
|
||||||
|
try (InputStream inputStream = response.body()) {
|
||||||
InputStream parserStream;
|
InputStream parserStream;
|
||||||
if (sitemapUrl.path.endsWith(".gz")) {
|
if (sitemapUrl.path.endsWith(".gz")) {
|
||||||
parserStream = new GZIPInputStream(inputStream);
|
parserStream = new GZIPInputStream(inputStream);
|
||||||
}
|
} else {
|
||||||
else {
|
|
||||||
parserStream = inputStream;
|
parserStream = inputStream;
|
||||||
}
|
}
|
||||||
|
|
||||||
Document parsedSitemap = Jsoup.parse(parserStream, "UTF-8", sitemapUrl.toString(), Parser.xmlParser());
|
parsedSitemap = Jsoup.parse(parserStream, "UTF-8", sitemapUrl.toString(), Parser.xmlParser());
|
||||||
|
}
|
||||||
|
finally {
|
||||||
|
sl.close();
|
||||||
|
}
|
||||||
|
|
||||||
if (parsedSitemap.childrenSize() == 0) {
|
if (parsedSitemap.childrenSize() == 0) {
|
||||||
return new SitemapResult.SitemapError();
|
return new SitemapResult.SitemapError();
|
||||||
}
|
}
|
||||||
@@ -380,13 +407,13 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
|
private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
|
||||||
try {
|
try (var sl = new SendLock()) {
|
||||||
var getRequest = HttpRequest.newBuilder()
|
var getRequest = HttpRequest.newBuilder()
|
||||||
.GET()
|
.GET()
|
||||||
.uri(url.asURI())
|
.uri(url.asURI())
|
||||||
.header("Accept-Encoding", "gzip")
|
.header("Accept-Encoding", "gzip")
|
||||||
.header("Accept", "text/*, */*;q=0.9")
|
.header("Accept", "text/*, */*;q=0.9")
|
||||||
.header("User-agent", userAgentString)
|
.header("User-Agent", userAgentString)
|
||||||
.timeout(requestTimeout);
|
.timeout(requestTimeout);
|
||||||
|
|
||||||
HttpFetchResult result = recorder.fetch(client, getRequest.build());
|
HttpFetchResult result = recorder.fetch(client, getRequest.build());
|
||||||
@@ -423,5 +450,30 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Permit on a process-wide cap of concurrently outstanding HTTP requests.
 * <p>
 * Constructing an instance takes one permit from a shared semaphore, waiting
 * uninterruptibly until one is free; {@link #close()} gives it back. The number
 * of permits is read from the system property {@code crawler.maxConcurrentRequests}
 * and defaults to 512.
 */
class SendLock implements AutoCloseable {

    private static final Semaphore maxConcurrentRequests = new Semaphore(Integer.getInteger("crawler.maxConcurrentRequests", 512));

    /** Set once the permit has been returned, so that repeated close() calls release only once */
    boolean closed = false;

    /** Take a permit, waiting (without reacting to interrupts) until one is available. */
    public SendLock() {
        maxConcurrentRequests.acquireUninterruptibly();
    }

    /** Run {@code client.send(request, handler)} while holding a permit.
     *
     * @return the response produced by the client
     * @throws IOException          propagated from the client
     * @throws InterruptedException propagated from the client
     */
    public static <T> HttpResponse<T> wrapSend(HttpClient client, HttpRequest request, HttpResponse.BodyHandler<T> handler) throws IOException, InterruptedException {
        final SendLock permit = new SendLock();
        try {
            return client.send(request, handler);
        }
        finally {
            permit.close();
        }
    }

    /** Return the permit; calls after the first have no effect. */
    @Override
    public void close() {
        if (closed) {
            return;
        }

        maxConcurrentRequests.release();
        closed = true;
    }
}
|
||||||
|
|
||||||
|
@@ -8,7 +8,10 @@ import java.net.http.HttpHeaders;
|
|||||||
import java.net.http.HttpResponse;
|
import java.net.http.HttpResponse;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.time.Instant;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.concurrent.*;
|
||||||
import java.util.zip.GZIPInputStream;
|
import java.util.zip.GZIPInputStream;
|
||||||
|
|
||||||
/** Input buffer for temporary storage of a HTTP response
|
/** Input buffer for temporary storage of a HTTP response
|
||||||
@@ -39,7 +42,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
|||||||
* and suppressed from the headers.
|
* and suppressed from the headers.
|
||||||
* If an error occurs, a buffer will be created with no content and an error status.
|
* If an error occurs, a buffer will be created with no content and an error status.
|
||||||
*/
|
*/
|
||||||
static WarcInputBuffer forResponse(HttpResponse<InputStream> rsp) {
|
static WarcInputBuffer forResponse(HttpResponse<InputStream> rsp, Duration timeLimit) {
|
||||||
if (rsp == null)
|
if (rsp == null)
|
||||||
return new ErrorBuffer();
|
return new ErrorBuffer();
|
||||||
|
|
||||||
@@ -51,11 +54,11 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
|||||||
|
|
||||||
if (contentEncoding == null && contentLength > 0 && contentLength < 8192) {
|
if (contentEncoding == null && contentLength > 0 && contentLength < 8192) {
|
||||||
// If the content is small and not compressed, we can just read it into memory
|
// If the content is small and not compressed, we can just read it into memory
|
||||||
return new MemoryBuffer(headers, is, contentLength);
|
return new MemoryBuffer(headers, timeLimit, is, contentLength);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
// Otherwise, we unpack it into a file and read it from there
|
// Otherwise, we unpack it into a file and read it from there
|
||||||
return new FileBuffer(headers, is);
|
return new FileBuffer(headers, timeLimit, is);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
@@ -64,9 +67,16 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static final ExecutorService virtualExecutorService = Executors.newVirtualThreadPerTaskExecutor();
|
||||||
|
|
||||||
|
private Future<Integer> readAsync(InputStream is, byte[] out) {
|
||||||
|
return virtualExecutorService.submit(() -> is.read(out));
|
||||||
|
}
|
||||||
|
|
||||||
/** Copy an input stream to an output stream, with a maximum size and time limit */
|
/** Copy an input stream to an output stream, with a maximum size and time limit */
|
||||||
protected void copy(InputStream is, OutputStream os) {
|
protected void copy(InputStream is, OutputStream os, Duration timeLimit) {
|
||||||
long startTime = System.currentTimeMillis();
|
Instant start = Instant.now();
|
||||||
|
Instant timeout = start.plus(timeLimit);
|
||||||
long size = 0;
|
long size = 0;
|
||||||
|
|
||||||
byte[] buffer = new byte[8192];
|
byte[] buffer = new byte[8192];
|
||||||
@@ -76,7 +86,15 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
|||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
try {
|
try {
|
||||||
int n = is.read(buffer);
|
Duration remaining = Duration.between(Instant.now(), timeout);
|
||||||
|
if (remaining.isNegative()) {
|
||||||
|
truncationReason = WarcTruncationReason.TIME;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
Future<Integer> readAsync = readAsync(is, buffer);
|
||||||
|
int n = readAsync.get(remaining.toMillis(), TimeUnit.MILLISECONDS);
|
||||||
|
|
||||||
if (n < 0) break;
|
if (n < 0) break;
|
||||||
size += n;
|
size += n;
|
||||||
os.write(buffer, 0, n);
|
os.write(buffer, 0, n);
|
||||||
@@ -85,12 +103,11 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
|||||||
truncationReason = WarcTruncationReason.LENGTH;
|
truncationReason = WarcTruncationReason.LENGTH;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
} catch (IOException|ExecutionException e) {
|
||||||
if (System.currentTimeMillis() - startTime > WarcRecorder.MAX_TIME) {
|
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
||||||
|
} catch (TimeoutException e) {
|
||||||
truncationReason = WarcTruncationReason.TIME;
|
truncationReason = WarcTruncationReason.TIME;
|
||||||
break;
|
} catch (InterruptedException e) {
|
||||||
}
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -123,12 +140,12 @@ class ErrorBuffer extends WarcInputBuffer {
|
|||||||
/** Buffer for when we have the response in memory */
|
/** Buffer for when we have the response in memory */
|
||||||
class MemoryBuffer extends WarcInputBuffer {
|
class MemoryBuffer extends WarcInputBuffer {
|
||||||
byte[] data;
|
byte[] data;
|
||||||
public MemoryBuffer(HttpHeaders headers, InputStream responseStream, int size) {
|
public MemoryBuffer(HttpHeaders headers, Duration timeLimit, InputStream responseStream, int size) {
|
||||||
super(headers);
|
super(headers);
|
||||||
|
|
||||||
var outputStream = new ByteArrayOutputStream(size);
|
var outputStream = new ByteArrayOutputStream(size);
|
||||||
|
|
||||||
copy(responseStream, outputStream);
|
copy(responseStream, outputStream, timeLimit);
|
||||||
|
|
||||||
data = outputStream.toByteArray();
|
data = outputStream.toByteArray();
|
||||||
}
|
}
|
||||||
@@ -152,7 +169,7 @@ class MemoryBuffer extends WarcInputBuffer {
|
|||||||
class FileBuffer extends WarcInputBuffer {
|
class FileBuffer extends WarcInputBuffer {
|
||||||
private final Path tempFile;
|
private final Path tempFile;
|
||||||
|
|
||||||
public FileBuffer(HttpHeaders headers, InputStream responseStream) throws IOException {
|
public FileBuffer(HttpHeaders headers, Duration timeLimit, InputStream responseStream) throws IOException {
|
||||||
super(suppressContentEncoding(headers));
|
super(suppressContentEncoding(headers));
|
||||||
|
|
||||||
this.tempFile = Files.createTempFile("rsp", ".html");
|
this.tempFile = Files.createTempFile("rsp", ".html");
|
||||||
@@ -160,7 +177,7 @@ class FileBuffer extends WarcInputBuffer {
|
|||||||
|
|
||||||
if ("gzip".equalsIgnoreCase(headers.firstValue("Content-Encoding").orElse(""))) {
|
if ("gzip".equalsIgnoreCase(headers.firstValue("Content-Encoding").orElse(""))) {
|
||||||
try (var out = Files.newOutputStream(tempFile)) {
|
try (var out = Files.newOutputStream(tempFile)) {
|
||||||
copy(new GZIPInputStream(responseStream), out);
|
copy(new GZIPInputStream(responseStream), out, timeLimit);
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
||||||
@@ -168,7 +185,7 @@ class FileBuffer extends WarcInputBuffer {
|
|||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
try (var out = Files.newOutputStream(tempFile)) {
|
try (var out = Files.newOutputStream(tempFile)) {
|
||||||
copy(responseStream, out);
|
copy(responseStream, out, timeLimit);
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
||||||
|
@@ -102,7 +102,7 @@ public class WarcRecorder implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response);
|
try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request.timeout().orElseGet(() -> Duration.ofMillis(MAX_TIME)));
|
||||||
InputStream inputStream = inputBuffer.read())
|
InputStream inputStream = inputBuffer.read())
|
||||||
{
|
{
|
||||||
if (cookies.hasCookies()) {
|
if (cookies.hasCookies()) {
|
||||||
|
@@ -44,6 +44,14 @@ public class DomainLocks {
|
|||||||
return new Semaphore(2);
|
return new Semaphore(2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean canLock(EdgeDomain domain) {
|
||||||
|
Semaphore sem = locks.get(domain.topDomain.toLowerCase());
|
||||||
|
if (null == sem)
|
||||||
|
return true;
|
||||||
|
else
|
||||||
|
return sem.availablePermits() > 0;
|
||||||
|
}
|
||||||
|
|
||||||
public static class DomainLock implements AutoCloseable {
|
public static class DomainLock implements AutoCloseable {
|
||||||
private final String domainName;
|
private final String domainName;
|
||||||
private final Semaphore semaphore;
|
private final Semaphore semaphore;
|
||||||
|
@@ -3,6 +3,7 @@ package nu.marginalia.crawl.retreival;
|
|||||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||||
|
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
|
import java.util.concurrent.ThreadLocalRandom;
|
||||||
|
|
||||||
import static java.lang.Math.max;
|
import static java.lang.Math.max;
|
||||||
import static java.lang.Math.min;
|
import static java.lang.Math.min;
|
||||||
@@ -53,12 +54,13 @@ public class CrawlDelayTimer {
|
|||||||
public void waitFetchDelay(long spentTime) {
|
public void waitFetchDelay(long spentTime) {
|
||||||
long sleepTime = delayTime;
|
long sleepTime = delayTime;
|
||||||
|
|
||||||
|
long jitter = ThreadLocalRandom.current().nextLong(0, 150);
|
||||||
try {
|
try {
|
||||||
if (sleepTime >= 1) {
|
if (sleepTime >= 1) {
|
||||||
if (spentTime > sleepTime)
|
if (spentTime > sleepTime)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
Thread.sleep(min(sleepTime - spentTime, 5000));
|
Thread.sleep(min(sleepTime - spentTime, 5000) + jitter);
|
||||||
} else {
|
} else {
|
||||||
// When no crawl delay is specified, lean toward twice the fetch+process time,
|
// When no crawl delay is specified, lean toward twice the fetch+process time,
|
||||||
// within sane limits. This means slower servers get slower crawling, and faster
|
// within sane limits. This means slower servers get slower crawling, and faster
|
||||||
@@ -71,17 +73,17 @@ public class CrawlDelayTimer {
|
|||||||
if (spentTime > sleepTime)
|
if (spentTime > sleepTime)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
Thread.sleep(sleepTime - spentTime);
|
Thread.sleep(sleepTime - spentTime + jitter);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (slowDown) {
|
if (slowDown) {
|
||||||
// Additional delay when the server is signalling it wants slower requests
|
// Additional delay when the server is signalling it wants slower requests
|
||||||
Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS);
|
Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS + jitter);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (InterruptedException e) {
|
catch (InterruptedException e) {
|
||||||
Thread.currentThread().interrupt();
|
Thread.currentThread().interrupt();
|
||||||
throw new RuntimeException();
|
throw new RuntimeException("Interrupted", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -26,6 +26,8 @@ import java.io.IOException;
|
|||||||
import java.net.InetAddress;
|
import java.net.InetAddress;
|
||||||
import java.net.UnknownHostException;
|
import java.net.UnknownHostException;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.time.Instant;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
@@ -108,15 +110,24 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, delayTimer);
|
DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, delayTimer);
|
||||||
domainStateDb.save(summaryRecord);
|
domainStateDb.save(summaryRecord);
|
||||||
|
|
||||||
|
if (Thread.interrupted()) {
|
||||||
|
// There's a small chance we're interrupted during the sniffing portion
|
||||||
|
throw new InterruptedException();
|
||||||
|
}
|
||||||
|
|
||||||
|
Instant recrawlStart = Instant.now();
|
||||||
|
CrawlerRevisitor.RecrawlMetadata recrawlMetadata = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
|
||||||
|
Duration recrawlTime = Duration.between(recrawlStart, Instant.now());
|
||||||
|
|
||||||
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
|
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
|
||||||
if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
|
if (recrawlMetadata.size() > 0) {
|
||||||
// If we have reference data, we will always grow the crawl depth a bit
|
// If we have reference data, we will always grow the crawl depth a bit
|
||||||
crawlFrontier.increaseDepth(1.5, 2500);
|
crawlFrontier.increaseDepth(1.5, 2500);
|
||||||
}
|
}
|
||||||
|
|
||||||
oldCrawlData.close(); // proactively close the crawl data reference here to not hold onto expensive resources
|
oldCrawlData.close(); // proactively close the crawl data reference here to not hold onto expensive resources
|
||||||
|
|
||||||
yield crawlDomain(probedUrl, robotsRules, delayTimer, domainLinks);
|
yield crawlDomain(probedUrl, robotsRules, delayTimer, domainLinks, recrawlMetadata, recrawlTime);
|
||||||
}
|
}
|
||||||
case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> {
|
case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> {
|
||||||
domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString()));
|
domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString()));
|
||||||
@@ -138,16 +149,28 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
private int crawlDomain(EdgeUrl rootUrl,
|
private int crawlDomain(EdgeUrl rootUrl,
|
||||||
SimpleRobotRules robotsRules,
|
SimpleRobotRules robotsRules,
|
||||||
CrawlDelayTimer delayTimer,
|
CrawlDelayTimer delayTimer,
|
||||||
DomainLinks domainLinks) {
|
DomainLinks domainLinks,
|
||||||
|
CrawlerRevisitor.RecrawlMetadata recrawlMetadata,
|
||||||
|
Duration recrawlTime) {
|
||||||
|
|
||||||
|
Instant crawlStart = Instant.now();
|
||||||
|
|
||||||
// Add external links to the crawl frontier
|
// Add external links to the crawl frontier
|
||||||
crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));
|
crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));
|
||||||
|
|
||||||
// Fetch sitemaps
|
// Fetch sitemaps
|
||||||
for (var sitemap : robotsRules.getSitemaps()) {
|
for (var sitemap : robotsRules.getSitemaps()) {
|
||||||
|
|
||||||
|
// Validate the sitemap URL and check that it belongs to the same domain as the root URL
|
||||||
|
if (EdgeUrl.parse(sitemap)
|
||||||
|
.map(url -> url.getDomain().equals(rootUrl.domain))
|
||||||
|
.orElse(false)) {
|
||||||
|
|
||||||
crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
|
crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int crawlerAdditions = 0;
|
||||||
|
|
||||||
while (!crawlFrontier.isEmpty()
|
while (!crawlFrontier.isEmpty()
|
||||||
&& !crawlFrontier.isCrawlDepthReached()
|
&& !crawlFrontier.isCrawlDepthReached()
|
||||||
@@ -180,7 +203,11 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
continue;
|
continue;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
fetchContentWithReference(top, delayTimer, DocumentWithReference.empty());
|
var result = fetchContentWithReference(top, delayTimer, DocumentWithReference.empty());
|
||||||
|
|
||||||
|
if (result.isOk()) {
|
||||||
|
crawlerAdditions++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
catch (InterruptedException ex) {
|
catch (InterruptedException ex) {
|
||||||
Thread.currentThread().interrupt();
|
Thread.currentThread().interrupt();
|
||||||
@@ -188,6 +215,17 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Duration crawlTime = Duration.between(crawlStart, Instant.now());
|
||||||
|
domainStateDb.save(new DomainStateDb.CrawlMeta(
|
||||||
|
domain,
|
||||||
|
Instant.now(),
|
||||||
|
recrawlTime,
|
||||||
|
crawlTime,
|
||||||
|
recrawlMetadata.errors(),
|
||||||
|
crawlerAdditions,
|
||||||
|
recrawlMetadata.size() + crawlerAdditions
|
||||||
|
));
|
||||||
|
|
||||||
return crawlFrontier.visitedSize();
|
return crawlFrontier.visitedSize();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -289,6 +327,10 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
logger.error("Error configuring link filter", ex);
|
logger.error("Error configuring link filter", ex);
|
||||||
|
if (Thread.interrupted()) {
|
||||||
|
Thread.currentThread().interrupt();
|
||||||
|
return DomainStateDb.SummaryRecord.forError(domain, "Crawler Interrupted", ex.getMessage());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
finally {
|
finally {
|
||||||
crawlFrontier.addVisited(rootUrl);
|
crawlFrontier.addVisited(rootUrl);
|
||||||
@@ -316,7 +358,7 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
);
|
);
|
||||||
|
|
||||||
private Optional<String> guessFeedUrl(CrawlDelayTimer timer) throws InterruptedException {
|
private Optional<String> guessFeedUrl(CrawlDelayTimer timer) throws InterruptedException {
|
||||||
var oldDomainStateRecord = domainStateDb.get(domain);
|
var oldDomainStateRecord = domainStateDb.getSummary(domain);
|
||||||
|
|
||||||
// If we are already aware of an old feed URL, then we can just revalidate it
|
// If we are already aware of an old feed URL, then we can just revalidate it
|
||||||
if (oldDomainStateRecord.isPresent()) {
|
if (oldDomainStateRecord.isPresent()) {
|
||||||
@@ -381,8 +423,10 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
if (docOpt.isPresent()) {
|
if (docOpt.isPresent()) {
|
||||||
var doc = docOpt.get();
|
var doc = docOpt.get();
|
||||||
|
|
||||||
crawlFrontier.enqueueLinksFromDocument(top, doc);
|
var responseUrl = new EdgeUrl(ok.uri());
|
||||||
crawlFrontier.addVisited(new EdgeUrl(ok.uri()));
|
|
||||||
|
crawlFrontier.enqueueLinksFromDocument(responseUrl, doc);
|
||||||
|
crawlFrontier.addVisited(responseUrl);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {
|
else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {
|
||||||
|
@@ -31,7 +31,7 @@ public class CrawlerRevisitor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** Performs a re-crawl of old documents, comparing etags and last-modified */
|
/** Performs a re-crawl of old documents, comparing etags and last-modified */
|
||||||
public int recrawl(CrawlDataReference oldCrawlData,
|
public RecrawlMetadata recrawl(CrawlDataReference oldCrawlData,
|
||||||
SimpleRobotRules robotsRules,
|
SimpleRobotRules robotsRules,
|
||||||
CrawlDelayTimer delayTimer)
|
CrawlDelayTimer delayTimer)
|
||||||
throws InterruptedException {
|
throws InterruptedException {
|
||||||
@@ -39,6 +39,7 @@ public class CrawlerRevisitor {
|
|||||||
int retained = 0;
|
int retained = 0;
|
||||||
int errors = 0;
|
int errors = 0;
|
||||||
int skipped = 0;
|
int skipped = 0;
|
||||||
|
int size = 0;
|
||||||
|
|
||||||
for (CrawledDocument doc : oldCrawlData) {
|
for (CrawledDocument doc : oldCrawlData) {
|
||||||
if (errors > 20) {
|
if (errors > 20) {
|
||||||
@@ -46,6 +47,10 @@ public class CrawlerRevisitor {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (Thread.interrupted()) {
|
||||||
|
throw new InterruptedException();
|
||||||
|
}
|
||||||
|
|
||||||
var urlMaybe = EdgeUrl.parse(doc.url);
|
var urlMaybe = EdgeUrl.parse(doc.url);
|
||||||
if (urlMaybe.isEmpty())
|
if (urlMaybe.isEmpty())
|
||||||
continue;
|
continue;
|
||||||
@@ -78,6 +83,7 @@ public class CrawlerRevisitor {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size++;
|
||||||
|
|
||||||
double skipProb;
|
double skipProb;
|
||||||
|
|
||||||
@@ -150,6 +156,8 @@ public class CrawlerRevisitor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return recrawled;
|
return new RecrawlMetadata(size, errors, skipped);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public record RecrawlMetadata(int size, int errors, int skipped) {}
|
||||||
}
|
}
|
||||||
|
@@ -42,18 +42,20 @@ public interface SerializableCrawlDataStream extends AutoCloseable {
|
|||||||
{
|
{
|
||||||
|
|
||||||
String fileName = fullPath.getFileName().toString();
|
String fileName = fullPath.getFileName().toString();
|
||||||
if (fileName.endsWith(".parquet")) {
|
|
||||||
|
if (fileName.endsWith(".slop.zip")) {
|
||||||
try {
|
try {
|
||||||
return new ParquetSerializableCrawlDataStream(fullPath);
|
return new SlopSerializableCrawlDataStream(fullPath);
|
||||||
} catch (Exception ex) {
|
} catch (Exception ex) {
|
||||||
logger.error("Error reading domain data from " + fullPath, ex);
|
logger.error("Error reading domain data from " + fullPath, ex);
|
||||||
return SerializableCrawlDataStream.empty();
|
return SerializableCrawlDataStream.empty();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fileName.endsWith(".slop.zip")) {
|
else if (fileName.endsWith(".parquet")) {
|
||||||
|
logger.error("Opening deprecated parquet-style crawl data stream", new Exception());
|
||||||
try {
|
try {
|
||||||
return new SlopSerializableCrawlDataStream(fullPath);
|
return new ParquetSerializableCrawlDataStream(fullPath);
|
||||||
} catch (Exception ex) {
|
} catch (Exception ex) {
|
||||||
logger.error("Error reading domain data from " + fullPath, ex);
|
logger.error("Error reading domain data from " + fullPath, ex);
|
||||||
return SerializableCrawlDataStream.empty();
|
return SerializableCrawlDataStream.empty();
|
||||||
|
@@ -12,8 +12,7 @@ import java.io.InputStream;
|
|||||||
import java.net.InetAddress;
|
import java.net.InetAddress;
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
import java.net.http.HttpHeaders;
|
import java.net.http.HttpHeaders;
|
||||||
import java.util.Arrays;
|
import java.util.*;
|
||||||
import java.util.Optional;
|
|
||||||
|
|
||||||
/* FIXME: This interface has a very unfortunate name that is not very descriptive.
|
/* FIXME: This interface has a very unfortunate name that is not very descriptive.
|
||||||
*/
|
*/
|
||||||
@@ -65,7 +64,21 @@ public sealed interface HttpFetchResult {
|
|||||||
) implements HttpFetchResult {
|
) implements HttpFetchResult {
|
||||||
|
|
||||||
public ResultOk(URI uri, int status, MessageHeaders headers, String ipAddress, byte[] bytes, int bytesStart, int length) {
|
public ResultOk(URI uri, int status, MessageHeaders headers, String ipAddress, byte[] bytes, int bytesStart, int length) {
|
||||||
this(uri, status, HttpHeaders.of(headers.map(), (k,v) -> true), ipAddress, bytes, bytesStart, length);
|
this(uri, status, convertHeaders(headers), ipAddress, bytes, bytesStart, length);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static HttpHeaders convertHeaders(MessageHeaders messageHeaders) {
|
||||||
|
Map<String, List<String>> inputMap = messageHeaders.map();
|
||||||
|
Map<String, List<String>> filteredMap = new HashMap<>(Math.max(4, inputMap.size()));
|
||||||
|
|
||||||
|
inputMap.forEach((k, v) -> {
|
||||||
|
if (k.isBlank()) return;
|
||||||
|
if (!Character.isAlphabetic(k.charAt(0))) return;
|
||||||
|
|
||||||
|
filteredMap.put(k, v);
|
||||||
|
});
|
||||||
|
|
||||||
|
return HttpHeaders.of(filteredMap, (k,v) -> true);
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isOk() {
|
public boolean isOk() {
|
||||||
|
@@ -8,6 +8,7 @@ import java.io.IOException;
|
|||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
|
import java.time.Duration;
|
||||||
import java.time.Instant;
|
import java.time.Instant;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.*;
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
@@ -47,8 +48,8 @@ class DomainStateDbTest {
|
|||||||
db.save(allFields);
|
db.save(allFields);
|
||||||
db.save(minFields);
|
db.save(minFields);
|
||||||
|
|
||||||
assertEquals(allFields, db.get("all.marginalia.nu").orElseThrow());
|
assertEquals(allFields, db.getSummary("all.marginalia.nu").orElseThrow());
|
||||||
assertEquals(minFields, db.get("min.marginalia.nu").orElseThrow());
|
assertEquals(minFields, db.getSummary("min.marginalia.nu").orElseThrow());
|
||||||
|
|
||||||
var updatedAllFields = new DomainStateDb.SummaryRecord(
|
var updatedAllFields = new DomainStateDb.SummaryRecord(
|
||||||
"all.marginalia.nu",
|
"all.marginalia.nu",
|
||||||
@@ -59,7 +60,19 @@ class DomainStateDbTest {
|
|||||||
);
|
);
|
||||||
|
|
||||||
db.save(updatedAllFields);
|
db.save(updatedAllFields);
|
||||||
assertEquals(updatedAllFields, db.get("all.marginalia.nu").orElseThrow());
|
assertEquals(updatedAllFields, db.getSummary("all.marginalia.nu").orElseThrow());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testMetadata() throws SQLException {
|
||||||
|
try (var db = new DomainStateDb(tempFile)) {
|
||||||
|
var original = new DomainStateDb.CrawlMeta("example.com", Instant.ofEpochMilli(12345), Duration.ofMillis(30), Duration.ofMillis(300), 1, 2, 3);
|
||||||
|
db.save(original);
|
||||||
|
|
||||||
|
var maybeMeta = db.getMeta("example.com");
|
||||||
|
assertTrue(maybeMeta.isPresent());
|
||||||
|
assertEquals(original, maybeMeta.get());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -0,0 +1,152 @@
|
|||||||
|
package nu.marginalia.crawl.retreival.fetcher;
|
||||||
|
|
||||||
|
import com.sun.net.httpserver.HttpServer;
|
||||||
|
import nu.marginalia.crawl.fetcher.Cookies;
|
||||||
|
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||||
|
import org.junit.jupiter.api.*;
|
||||||
|
import org.netpreserve.jwarc.WarcReader;
|
||||||
|
import org.netpreserve.jwarc.WarcRequest;
|
||||||
|
import org.netpreserve.jwarc.WarcResponse;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.net.InetSocketAddress;
|
||||||
|
import java.net.http.HttpClient;
|
||||||
|
import java.net.http.HttpRequest;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.time.Instant;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
@Tag("slow")
|
||||||
|
class WarcRecorderFakeServerTest {
|
||||||
|
static HttpServer server;
|
||||||
|
|
||||||
|
@BeforeAll
|
||||||
|
public static void setUpAll() throws IOException {
|
||||||
|
server = HttpServer.create(new InetSocketAddress("127.0.0.1", 14510), 10);
|
||||||
|
|
||||||
|
// This endpoint will finish sending the response immediately
|
||||||
|
server.createContext("/fast", exchange -> {
|
||||||
|
exchange.getResponseHeaders().add("Content-Type", "text/html");
|
||||||
|
exchange.sendResponseHeaders(200, "<html><body>hello</body></html>".length());
|
||||||
|
|
||||||
|
try (var os = exchange.getResponseBody()) {
|
||||||
|
os.write("<html><body>hello</body></html>".getBytes());
|
||||||
|
os.flush();
|
||||||
|
}
|
||||||
|
exchange.close();
|
||||||
|
});
|
||||||
|
|
||||||
|
// This endpoint will take 10 seconds to finish sending the response,
|
||||||
|
// which should trigger a timeout in the client
|
||||||
|
server.createContext("/slow", exchange -> {
|
||||||
|
exchange.getResponseHeaders().add("Content-Type", "text/html");
|
||||||
|
exchange.sendResponseHeaders(200, "<html><body>hello</body></html>:D".length());
|
||||||
|
|
||||||
|
try (var os = exchange.getResponseBody()) {
|
||||||
|
os.write("<html><body>hello</body></html>".getBytes());
|
||||||
|
os.flush();
|
||||||
|
try {
|
||||||
|
TimeUnit.SECONDS.sleep(10);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
os.write(":D".getBytes());
|
||||||
|
os.flush();
|
||||||
|
}
|
||||||
|
exchange.close();
|
||||||
|
});
|
||||||
|
|
||||||
|
server.start();
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterAll
|
||||||
|
public static void tearDownAll() {
|
||||||
|
server.stop(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
Path fileNameWarc;
|
||||||
|
Path fileNameParquet;
|
||||||
|
WarcRecorder client;
|
||||||
|
|
||||||
|
HttpClient httpClient;
|
||||||
|
@BeforeEach
|
||||||
|
public void setUp() throws Exception {
|
||||||
|
httpClient = HttpClient.newBuilder().build();
|
||||||
|
|
||||||
|
fileNameWarc = Files.createTempFile("test", ".warc");
|
||||||
|
fileNameParquet = Files.createTempFile("test", ".parquet");
|
||||||
|
|
||||||
|
client = new WarcRecorder(fileNameWarc, new Cookies());
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterEach
|
||||||
|
public void tearDown() throws Exception {
|
||||||
|
client.close();
|
||||||
|
Files.delete(fileNameWarc);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void fetchFast() throws Exception {
|
||||||
|
client.fetch(httpClient,
|
||||||
|
HttpRequest.newBuilder()
|
||||||
|
.uri(new java.net.URI("http://localhost:14510/fast"))
|
||||||
|
.timeout(Duration.ofSeconds(1))
|
||||||
|
.header("User-agent", "test.marginalia.nu")
|
||||||
|
.header("Accept-Encoding", "gzip")
|
||||||
|
.GET().build()
|
||||||
|
);
|
||||||
|
|
||||||
|
Map<String, String> sampleData = new HashMap<>();
|
||||||
|
try (var warcReader = new WarcReader(fileNameWarc)) {
|
||||||
|
warcReader.forEach(record -> {
|
||||||
|
if (record instanceof WarcRequest req) {
|
||||||
|
sampleData.put(record.type(), req.target());
|
||||||
|
}
|
||||||
|
if (record instanceof WarcResponse rsp) {
|
||||||
|
sampleData.put(record.type(), rsp.target());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
System.out.println(sampleData);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void fetchSlow() throws Exception {
|
||||||
|
Instant start = Instant.now();
|
||||||
|
client.fetch(httpClient,
|
||||||
|
HttpRequest.newBuilder()
|
||||||
|
.uri(new java.net.URI("http://localhost:14510/slow"))
|
||||||
|
.timeout(Duration.ofSeconds(1))
|
||||||
|
.header("User-agent", "test.marginalia.nu")
|
||||||
|
.header("Accept-Encoding", "gzip")
|
||||||
|
.GET().build()
|
||||||
|
);
|
||||||
|
Instant end = Instant.now();
|
||||||
|
|
||||||
|
Map<String, String> sampleData = new HashMap<>();
|
||||||
|
try (var warcReader = new WarcReader(fileNameWarc)) {
|
||||||
|
warcReader.forEach(record -> {
|
||||||
|
if (record instanceof WarcRequest req) {
|
||||||
|
sampleData.put(record.type(), req.target());
|
||||||
|
}
|
||||||
|
if (record instanceof WarcResponse rsp) {
|
||||||
|
sampleData.put(record.type(), rsp.target());
|
||||||
|
System.out.println(rsp.target());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
System.out.println(sampleData);
|
||||||
|
|
||||||
|
// Timeout is set to 1 second, but the server will take 10 seconds to respond,
|
||||||
|
// so we expect the request to take 1s and change before it times out.
|
||||||
|
|
||||||
|
Assertions.assertTrue(Duration.between(start, end).toMillis() < 2000);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@@ -7,8 +7,7 @@ import java.util.Arrays;
|
|||||||
|
|
||||||
public enum SearchJsParameter {
|
public enum SearchJsParameter {
|
||||||
DEFAULT("default"),
|
DEFAULT("default"),
|
||||||
DENY_JS("no-js", "js:true"),
|
DENY_JS("no-js", "special:scripts");
|
||||||
REQUIRE_JS("yes-js", "js:false");
|
|
||||||
|
|
||||||
public final String value;
|
public final String value;
|
||||||
public final String[] implictExcludeSearchTerms;
|
public final String[] implictExcludeSearchTerms;
|
||||||
@@ -20,7 +19,6 @@ public enum SearchJsParameter {
|
|||||||
|
|
||||||
public static SearchJsParameter parse(@Nullable String value) {
|
public static SearchJsParameter parse(@Nullable String value) {
|
||||||
if (DENY_JS.value.equals(value)) return DENY_JS;
|
if (DENY_JS.value.equals(value)) return DENY_JS;
|
||||||
if (REQUIRE_JS.value.equals(value)) return REQUIRE_JS;
|
|
||||||
|
|
||||||
return DEFAULT;
|
return DEFAULT;
|
||||||
}
|
}
|
||||||
|
@@ -41,6 +41,7 @@ dependencies {
|
|||||||
|
|
||||||
implementation project(':code:functions:live-capture:api')
|
implementation project(':code:functions:live-capture:api')
|
||||||
implementation project(':code:functions:math:api')
|
implementation project(':code:functions:math:api')
|
||||||
|
implementation project(':code:functions:favicon:api')
|
||||||
implementation project(':code:functions:domain-info:api')
|
implementation project(':code:functions:domain-info:api')
|
||||||
implementation project(':code:functions:search-query:api')
|
implementation project(':code:functions:search-query:api')
|
||||||
|
|
||||||
|
@@ -3,8 +3,14 @@ package nu.marginalia.search;
|
|||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import io.jooby.Context;
|
import io.jooby.Context;
|
||||||
import io.jooby.Jooby;
|
import io.jooby.Jooby;
|
||||||
|
import io.jooby.MediaType;
|
||||||
|
import io.jooby.StatusCode;
|
||||||
import io.prometheus.client.Counter;
|
import io.prometheus.client.Counter;
|
||||||
import io.prometheus.client.Histogram;
|
import io.prometheus.client.Histogram;
|
||||||
|
import nu.marginalia.WebsiteUrl;
|
||||||
|
import nu.marginalia.api.favicon.FaviconClient;
|
||||||
|
import nu.marginalia.db.DbDomainQueries;
|
||||||
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.search.svc.*;
|
import nu.marginalia.search.svc.*;
|
||||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||||
import nu.marginalia.service.server.BaseServiceParams;
|
import nu.marginalia.service.server.BaseServiceParams;
|
||||||
@@ -13,10 +19,14 @@ import org.slf4j.Logger;
|
|||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.NoSuchElementException;
|
||||||
|
|
||||||
public class SearchService extends JoobyService {
|
public class SearchService extends JoobyService {
|
||||||
|
|
||||||
|
private final WebsiteUrl websiteUrl;
|
||||||
private final SearchSiteSubscriptionService siteSubscriptionService;
|
private final SearchSiteSubscriptionService siteSubscriptionService;
|
||||||
|
private final FaviconClient faviconClient;
|
||||||
|
private final DbDomainQueries domainQueries;
|
||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(SearchService.class);
|
private static final Logger logger = LoggerFactory.getLogger(SearchService.class);
|
||||||
private static final Histogram wmsa_search_service_request_time = Histogram.build()
|
private static final Histogram wmsa_search_service_request_time = Histogram.build()
|
||||||
@@ -33,12 +43,15 @@ public class SearchService extends JoobyService {
|
|||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public SearchService(BaseServiceParams params,
|
public SearchService(BaseServiceParams params,
|
||||||
|
WebsiteUrl websiteUrl,
|
||||||
SearchFrontPageService frontPageService,
|
SearchFrontPageService frontPageService,
|
||||||
SearchAddToCrawlQueueService addToCrawlQueueService,
|
SearchAddToCrawlQueueService addToCrawlQueueService,
|
||||||
SearchSiteSubscriptionService siteSubscriptionService,
|
SearchSiteSubscriptionService siteSubscriptionService,
|
||||||
SearchSiteInfoService siteInfoService,
|
SearchSiteInfoService siteInfoService,
|
||||||
SearchCrosstalkService crosstalkService,
|
SearchCrosstalkService crosstalkService,
|
||||||
SearchBrowseService searchBrowseService,
|
SearchBrowseService searchBrowseService,
|
||||||
|
FaviconClient faviconClient,
|
||||||
|
DbDomainQueries domainQueries,
|
||||||
SearchQueryService searchQueryService)
|
SearchQueryService searchQueryService)
|
||||||
throws Exception {
|
throws Exception {
|
||||||
super(params,
|
super(params,
|
||||||
@@ -51,8 +64,11 @@ public class SearchService extends JoobyService {
|
|||||||
new SearchAddToCrawlQueueService_(addToCrawlQueueService),
|
new SearchAddToCrawlQueueService_(addToCrawlQueueService),
|
||||||
new SearchBrowseService_(searchBrowseService)
|
new SearchBrowseService_(searchBrowseService)
|
||||||
));
|
));
|
||||||
|
this.websiteUrl = websiteUrl;
|
||||||
|
|
||||||
this.siteSubscriptionService = siteSubscriptionService;
|
this.siteSubscriptionService = siteSubscriptionService;
|
||||||
|
this.faviconClient = faviconClient;
|
||||||
|
this.domainQueries = domainQueries;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@@ -62,6 +78,36 @@ public class SearchService extends JoobyService {
|
|||||||
final String startTimeAttribute = "start-time";
|
final String startTimeAttribute = "start-time";
|
||||||
|
|
||||||
jooby.get("/export-opml", siteSubscriptionService::exportOpml);
|
jooby.get("/export-opml", siteSubscriptionService::exportOpml);
|
||||||
|
|
||||||
|
jooby.get("/site/https://*", this::handleSiteUrlRedirect);
|
||||||
|
jooby.get("/site/http://*", this::handleSiteUrlRedirect);
|
||||||
|
|
||||||
|
String emptySvg = "<svg xmlns=\"http://www.w3.org/2000/svg\"></svg>";
|
||||||
|
jooby.get("/site/{domain}/favicon", ctx -> {
|
||||||
|
String domain = ctx.path("domain").value();
|
||||||
|
logger.info("Finding icon for domain {}", domain);
|
||||||
|
try {
|
||||||
|
DbDomainQueries.DomainIdWithNode domainIdWithNode = domainQueries.getDomainIdWithNode(new EdgeDomain(domain));
|
||||||
|
var faviconMaybe = faviconClient.getFavicon(domain, domainIdWithNode.nodeAffinity());
|
||||||
|
|
||||||
|
if (faviconMaybe.isEmpty()) {
|
||||||
|
ctx.setResponseType(MediaType.valueOf("image/svg+xml"));
|
||||||
|
return emptySvg;
|
||||||
|
} else {
|
||||||
|
var favicon = faviconMaybe.get();
|
||||||
|
|
||||||
|
ctx.responseStream(MediaType.valueOf(favicon.contentType()), consumer -> {
|
||||||
|
consumer.write(favicon.bytes());
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (NoSuchElementException ex) {
|
||||||
|
ctx.setResponseType(MediaType.valueOf("image/svg+xml"));
|
||||||
|
return emptySvg;
|
||||||
|
}
|
||||||
|
return "";
|
||||||
|
});
|
||||||
|
|
||||||
jooby.before((Context ctx) -> {
|
jooby.before((Context ctx) -> {
|
||||||
ctx.setAttribute(startTimeAttribute, System.nanoTime());
|
ctx.setAttribute(startTimeAttribute, System.nanoTime());
|
||||||
});
|
});
|
||||||
@@ -80,5 +126,19 @@ public class SearchService extends JoobyService {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Redirect handler for the case when the user passes
|
||||||
|
* an url like /site/https://example.com/, in this
|
||||||
|
* scenario we want to extract the domain name and redirect
|
||||||
|
* to /site/example.com/
|
||||||
|
*/
|
||||||
|
private Context handleSiteUrlRedirect(Context ctx) {
|
||||||
|
var pv = ctx.path("*").value();
|
||||||
|
int trailSlash = pv.indexOf('/');
|
||||||
|
if (trailSlash > 0) {
|
||||||
|
pv = pv.substring(0, trailSlash);
|
||||||
|
}
|
||||||
|
ctx.sendRedirect(StatusCode.TEMPORARY_REDIRECT, websiteUrl.withPath("site/" + pv));
|
||||||
|
return ctx;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@@ -7,9 +7,7 @@ import java.util.Arrays;
|
|||||||
|
|
||||||
public enum SearchJsParameter {
|
public enum SearchJsParameter {
|
||||||
DEFAULT("default"),
|
DEFAULT("default"),
|
||||||
DENY_JS("no-js", "js:true"),
|
DENY_JS("no-js", "special:scripts");
|
||||||
REQUIRE_JS("yes-js", "js:false");
|
|
||||||
|
|
||||||
public final String value;
|
public final String value;
|
||||||
public final String[] implictExcludeSearchTerms;
|
public final String[] implictExcludeSearchTerms;
|
||||||
|
|
||||||
@@ -20,7 +18,6 @@ public enum SearchJsParameter {
|
|||||||
|
|
||||||
public static SearchJsParameter parse(@Nullable String value) {
|
public static SearchJsParameter parse(@Nullable String value) {
|
||||||
if (DENY_JS.value.equals(value)) return DENY_JS;
|
if (DENY_JS.value.equals(value)) return DENY_JS;
|
||||||
if (REQUIRE_JS.value.equals(value)) return REQUIRE_JS;
|
|
||||||
|
|
||||||
return DEFAULT;
|
return DEFAULT;
|
||||||
}
|
}
|
||||||
|
@@ -86,8 +86,10 @@ public record SearchParameters(WebsiteUrl url,
|
|||||||
public String renderUrl() {
|
public String renderUrl() {
|
||||||
|
|
||||||
StringBuilder pathBuilder = new StringBuilder("/search?");
|
StringBuilder pathBuilder = new StringBuilder("/search?");
|
||||||
pathBuilder.append("query=").append(URLEncoder.encode(query, StandardCharsets.UTF_8));
|
|
||||||
|
|
||||||
|
if (query != null) {
|
||||||
|
pathBuilder.append("query=").append(URLEncoder.encode(query, StandardCharsets.UTF_8));
|
||||||
|
}
|
||||||
if (profile != SearchProfile.NO_FILTER) {
|
if (profile != SearchProfile.NO_FILTER) {
|
||||||
pathBuilder.append("&profile=").append(URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8));
|
pathBuilder.append("&profile=").append(URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8));
|
||||||
}
|
}
|
||||||
|
@@ -67,6 +67,10 @@ public class DecoratedSearchResults {
|
|||||||
return focusDomainId >= 0;
|
return focusDomainId >= 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean isEmpty() {
|
||||||
|
return results.isEmpty();
|
||||||
|
}
|
||||||
|
|
||||||
public SearchFilters getFilters() {
|
public SearchFilters getFilters() {
|
||||||
return filters;
|
return filters;
|
||||||
}
|
}
|
||||||
|
@@ -81,6 +81,7 @@ public class SearchFilters {
|
|||||||
),
|
),
|
||||||
List.of(
|
List.of(
|
||||||
new Filter("Vintage", "fa-clock-rotate-left", SearchProfile.VINTAGE, parameters),
|
new Filter("Vintage", "fa-clock-rotate-left", SearchProfile.VINTAGE, parameters),
|
||||||
|
new Filter("Small Web", "fa-minus", SearchProfile.SMALLWEB, parameters),
|
||||||
new Filter("Plain Text", "fa-file", SearchProfile.PLAIN_TEXT, parameters),
|
new Filter("Plain Text", "fa-file", SearchProfile.PLAIN_TEXT, parameters),
|
||||||
new Filter("Tilde", "fa-house", SearchProfile.TILDE, parameters)
|
new Filter("Tilde", "fa-house", SearchProfile.TILDE, parameters)
|
||||||
),
|
),
|
||||||
|
@@ -56,7 +56,9 @@ public class SearchQueryService {
|
|||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
logger.error("Error", ex);
|
logger.error("Error", ex);
|
||||||
return errorPageService.serveError(SearchParameters.defaultsForQuery(websiteUrl, query, page));
|
return errorPageService.serveError(
|
||||||
|
SearchParameters.defaultsForQuery(websiteUrl, query, Objects.requireNonNullElse(page, 1))
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -10,7 +10,7 @@
|
|||||||
|
|
||||||
@template.part.head(title = "Marginalia Search - Explore")
|
@template.part.head(title = "Marginalia Search - Explore")
|
||||||
|
|
||||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans ">
|
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans ">
|
||||||
|
|
||||||
@template.part.navbar(navbar = navbar)
|
@template.part.navbar(navbar = navbar)
|
||||||
|
|
||||||
@@ -23,7 +23,7 @@
|
|||||||
</header>
|
</header>
|
||||||
|
|
||||||
<div class="max-w-[1400px] mx-auto flex flex-col gap-1 place-items-center">
|
<div class="max-w-[1400px] mx-auto flex flex-col gap-1 place-items-center">
|
||||||
<div class="border dark:border-gray-600 bg-white dark:bg-gray-800 dark:text-gray-100 my-4 p-3 rounded overflow-hidden flex flex-col space-y-4">
|
<div class="border border-gray-300 dark:border-gray-600 bg-white dark:bg-gray-800 dark:text-gray-100 my-4 p-3 rounded overflow-hidden flex flex-col space-y-4">
|
||||||
@if (results.hasFocusDomain())
|
@if (results.hasFocusDomain())
|
||||||
<div class="flex space-x-1">
|
<div class="flex space-x-1">
|
||||||
<span>Showing websites similar to <a class="font-mono text-liteblue dark:text-blue-200" href="/site/${results.focusDomain()}"><i class="fas fa-globe"></i> <span class="underline">${results.focusDomain()}</span></a></span>
|
<span>Showing websites similar to <a class="font-mono text-liteblue dark:text-blue-200" href="/site/${results.focusDomain()}"><i class="fas fa-globe"></i> <span class="underline">${results.focusDomain()}</span></a></span>
|
||||||
@@ -36,7 +36,7 @@
|
|||||||
</div>
|
</div>
|
||||||
<div class="grid-cols-1 gap-4 sm:grid sm:grid-cols-1 md:grid-cols-3 xl:grid-cols-4 mx-auto sm:p-4">
|
<div class="grid-cols-1 gap-4 sm:grid sm:grid-cols-1 md:grid-cols-3 xl:grid-cols-4 mx-auto sm:p-4">
|
||||||
@for (BrowseResult result : results.results())
|
@for (BrowseResult result : results.results())
|
||||||
<div class="bg-white border dark:border-gray-600 dark:bg-gray-800 rounded overflow-hidden">
|
<div class="bg-white border border-gray-300 dark:border-gray-600 dark:bg-gray-800 rounded overflow-hidden">
|
||||||
<div class="bg-margeblue text-white p-2 flex space-x-4 text-sm">
|
<div class="bg-margeblue text-white p-2 flex space-x-4 text-sm">
|
||||||
<span class="break-words">${result.displayDomain()}</span>
|
<span class="break-words">${result.displayDomain()}</span>
|
||||||
<div class="grow"></div>
|
<div class="grow"></div>
|
||||||
|
@@ -9,7 +9,7 @@
|
|||||||
<span>
|
<span>
|
||||||
Access logs containing IP-addresses are retained for up to 24 hours,
|
Access logs containing IP-addresses are retained for up to 24 hours,
|
||||||
anonymized logs with source addresses removed are sometimes kept longer
|
anonymized logs with source addresses removed are sometimes kept longer
|
||||||
to help diagnosing bugs.
|
to help diagnose bugs.
|
||||||
</span>
|
</span>
|
||||||
</div>
|
</div>
|
||||||
<div class="flex space-y-4 flex-col">
|
<div class="flex space-y-4 flex-col">
|
||||||
|
@@ -9,6 +9,15 @@
|
|||||||
nicotine: '#f8f8ee',
|
nicotine: '#f8f8ee',
|
||||||
margeblue: '#3e5f6f',
|
margeblue: '#3e5f6f',
|
||||||
liteblue: '#0066cc',
|
liteblue: '#0066cc',
|
||||||
|
bgblue: '#e5e9eb',
|
||||||
|
},
|
||||||
|
screens: {
|
||||||
|
'coarsepointer': {
|
||||||
|
'raw': '(pointer: coarse)'
|
||||||
|
},
|
||||||
|
'finepointer': {
|
||||||
|
'raw': '(pointer: fine)'
|
||||||
|
},
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
screens: {
|
screens: {
|
||||||
|
@@ -15,7 +15,7 @@
|
|||||||
|
|
||||||
@template.part.head(title = "Marginalia Search - " + parameters.query())
|
@template.part.head(title = "Marginalia Search - " + parameters.query())
|
||||||
|
|
||||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||||
@template.part.navbar(navbar = navbar)
|
@template.part.navbar(navbar = navbar)
|
||||||
|
|
||||||
<div>
|
<div>
|
||||||
|
@@ -9,7 +9,7 @@
|
|||||||
|
|
||||||
@template.part.head(title = "Marginalia Search - Error")
|
@template.part.head(title = "Marginalia Search - Error")
|
||||||
|
|
||||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||||
|
|
||||||
@template.part.navbar(navbar = navbar)
|
@template.part.navbar(navbar = navbar)
|
||||||
|
|
||||||
|
@@ -11,7 +11,7 @@
|
|||||||
|
|
||||||
@template.part.head(title = "Marginalia Search - " + results.getQuery())
|
@template.part.head(title = "Marginalia Search - " + results.getQuery())
|
||||||
|
|
||||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||||
@template.part.navbar(navbar = navbar)
|
@template.part.navbar(navbar = navbar)
|
||||||
|
|
||||||
<div>
|
<div>
|
||||||
@@ -23,7 +23,7 @@
|
|||||||
@template.serp.part.searchform(query = results.getParams().query(), profile = results.getProfile(), filters = results.getFilters())
|
@template.serp.part.searchform(query = results.getParams().query(), profile = results.getProfile(), filters = results.getFilters())
|
||||||
</div>
|
</div>
|
||||||
<div class="grow"></div>
|
<div class="grow"></div>
|
||||||
<button class="fixed bottom-10 right-5 sm:hidden text-sm bg-margeblue text-white p-4 rounded-xl active:text-slate-200" id="filter-button">
|
<button class="fixed bottom-10 right-5 finepointer:hidden md:hidden text-sm bg-margeblue text-white p-4 rounded-xl active:text-slate-200" id="filter-button">
|
||||||
<i class="fas fa-filter mr-3"></i>
|
<i class="fas fa-filter mr-3"></i>
|
||||||
Filters
|
Filters
|
||||||
</button>
|
</button>
|
||||||
@@ -44,6 +44,11 @@
|
|||||||
<div class="grow"></div>
|
<div class="grow"></div>
|
||||||
<a href="${results.getParams().renderUrlWithoutSiteFocus()}" class="fa fa-remove"></a>
|
<a href="${results.getParams().renderUrlWithoutSiteFocus()}" class="fa fa-remove"></a>
|
||||||
</div>
|
</div>
|
||||||
|
@elseif (results.isEmpty())
|
||||||
|
<div class="border dark:border-gray-600 rounded flex space-x-4 bg-white dark:bg-gray-800 text-gray-600 dark:text-gray-100 text-sm p-4 items-center">
|
||||||
|
No search results found. Try different search terms, or spelling variations. The search engine currently
|
||||||
|
only supports queries in the English language.
|
||||||
|
</div>
|
||||||
@endif
|
@endif
|
||||||
|
|
||||||
<div class="space-y-4 sm:space-y-6">
|
<div class="space-y-4 sm:space-y-6">
|
||||||
|
@@ -26,15 +26,15 @@
|
|||||||
It operates a bit like a clock, starting at the top and working its way around clockwise.</p>
|
It operates a bit like a clock, starting at the top and working its way around clockwise.</p>
|
||||||
|
|
||||||
<div class="flex gap-4 place-items-middle">
|
<div class="flex gap-4 place-items-middle">
|
||||||
@template.serp.part.matchogram(mask = 90)
|
@template.serp.part.matchogram(mask = 90, domain = "example.com")
|
||||||
<div>This is by the beginning</div>
|
<div>This is by the beginning</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="flex gap-4 place-items-middle">
|
<div class="flex gap-4 place-items-middle">
|
||||||
@template.serp.part.matchogram(mask = 90L<<26)
|
@template.serp.part.matchogram(mask = 90L<<26, domain = "example.com")
|
||||||
<div>This is in the middle</div>
|
<div>This is in the middle</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="flex gap-4 place-items-middle">
|
<div class="flex gap-4 place-items-middle">
|
||||||
@template.serp.part.matchogram(mask = 5L<<48)
|
@template.serp.part.matchogram(mask = 5L<<48, domain = "example.com")
|
||||||
<div>This is toward the end</div>
|
<div>This is toward the end</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@@ -1,11 +1,13 @@
|
|||||||
@import java.util.stream.IntStream
|
@import java.util.stream.IntStream
|
||||||
|
|
||||||
@param long mask
|
@param long mask
|
||||||
|
@param String domain
|
||||||
|
|
||||||
<svg width="40" height="40">
|
<svg width="40" height="40"
|
||||||
|
style="background-image: url('/site/${domain}/favicon'); background-repeat: no-repeat; background-size: 16px 16px; background-position: center; ">
|
||||||
<circle
|
<circle
|
||||||
cx="18"
|
cx="20"
|
||||||
cy="18"
|
cy="20"
|
||||||
r="16"
|
r="16"
|
||||||
fill="none"
|
fill="none"
|
||||||
stroke="#eee"
|
stroke="#eee"
|
||||||
@@ -13,10 +15,10 @@
|
|||||||
/>
|
/>
|
||||||
@for (int bit : IntStream.range(0, 56).filter(bit -> (mask & (1L << bit)) != 0).toArray())
|
@for (int bit : IntStream.range(0, 56).filter(bit -> (mask & (1L << bit)) != 0).toArray())
|
||||||
<line
|
<line
|
||||||
x1="${18 + 15*Math.sin(2 * Math.PI * bit / 56.)}"
|
x1="${20 + 15*Math.sin(2 * Math.PI * bit / 56.)}"
|
||||||
y1="${18 - 15*Math.cos(2 * Math.PI * bit / 56.)}"
|
y1="${20 - 15*Math.cos(2 * Math.PI * bit / 56.)}"
|
||||||
x2="${18 + 17*Math.sin(2 * Math.PI * bit / 56.)}"
|
x2="${20 + 17*Math.sin(2 * Math.PI * bit / 56.)}"
|
||||||
y2="${18 - 17*Math.cos(2 * Math.PI * bit / 56.)}"
|
y2="${20 - 17*Math.cos(2 * Math.PI * bit / 56.)}"
|
||||||
stroke="#444"
|
stroke="#444"
|
||||||
stroke-width="2"
|
stroke-width="2"
|
||||||
/>
|
/>
|
||||||
|
@@ -12,7 +12,7 @@
|
|||||||
<div class="flex flex-col grow" >
|
<div class="flex flex-col grow" >
|
||||||
<div class="flex flex-row space-x-2 place-items-center">
|
<div class="flex flex-row space-x-2 place-items-center">
|
||||||
<div class="flex-0" title="Match density">
|
<div class="flex-0" title="Match density">
|
||||||
@template.serp.part.matchogram(mask = result.first.positionsMask)
|
@template.serp.part.matchogram(mask = result.first.positionsMask, domain=result.getFirst().url.domain.toString())
|
||||||
</div>
|
</div>
|
||||||
<div class="flex grow justify-between items-start">
|
<div class="flex grow justify-between items-start">
|
||||||
<div class="flex-1">
|
<div class="flex-1">
|
||||||
|
@@ -3,7 +3,7 @@
|
|||||||
|
|
||||||
@param SearchFilters filters
|
@param SearchFilters filters
|
||||||
|
|
||||||
<aside class="md:w-64 py-4 shrink-0 hidden sm:block">
|
<aside class="md:w-64 py-4 shrink-0 hidden md:block finepointer:block">
|
||||||
<div class="space-y-6 sticky top-4">
|
<div class="space-y-6 sticky top-4">
|
||||||
<div class="bg-white dark:bg-gray-800 p-4 border dark:border-gray-600 border-gray-300">
|
<div class="bg-white dark:bg-gray-800 p-4 border dark:border-gray-600 border-gray-300">
|
||||||
<h2 class="font-medium mb-3 flex items-center font-serif hidden md:block">
|
<h2 class="font-medium mb-3 flex items-center font-serif hidden md:block">
|
||||||
@@ -13,7 +13,7 @@
|
|||||||
@for (List<SearchFilters.Filter> filterGroup : filters.getFilterGroups())
|
@for (List<SearchFilters.Filter> filterGroup : filters.getFilterGroups())
|
||||||
@for (SearchFilters.Filter filter : filterGroup)
|
@for (SearchFilters.Filter filter : filterGroup)
|
||||||
<label class="flex items-center">
|
<label class="flex items-center">
|
||||||
<button title="${filter.displayName}" onclick="document.location='$unsafe{filter.url}'" class="flex-1 py-2 pl-2 rounded flex space-x-2 dark:has-[:checked]:bg-gray-950 has-[:checked]:bg-gray-100 has-[:checked]:text-slate-900 dark:has-[:checked]:text-slate-100 hover:bg-gray-50 dark:hover:bg-gray-950 bg-white dark:bg-gray-900 dark:border dark:border-gray-600 text-margeblue dark:text-slate-200 outline-1 active:outline">
|
<button title="${filter.displayName}" onclick="document.location='$unsafe{filter.url}'" class="flex-1 py-2 pl-2 rounded flex space-x-2 dark:has-[:checked]:bg-gray-950 has-[:checked]:bg-gray-300 has-[:checked]:text-slate-900 dark:has-[:checked]:text-slate-100 hover:bg-gray-50 dark:hover:bg-gray-950 bg-white dark:bg-gray-900 dark:border dark:border-gray-600 text-margeblue dark:text-slate-200 outline-1 active:outline">
|
||||||
@if (filter.current)
|
@if (filter.current)
|
||||||
<input type="checkbox" checked class="sr-only" aria-checked="true" />
|
<input type="checkbox" checked class="sr-only" aria-checked="true" />
|
||||||
@else
|
@else
|
||||||
@@ -38,7 +38,7 @@
|
|||||||
<div class="space-y-2">
|
<div class="space-y-2">
|
||||||
@for (SearchFilters.SearchOption option : filters.searchOptions())
|
@for (SearchFilters.SearchOption option : filters.searchOptions())
|
||||||
<label class="flex items-center">
|
<label class="flex items-center">
|
||||||
<button title="${option.name()}" onclick="document.location='$unsafe{option.getUrl()}'" class="flex-1 py-2 pl-2 rounded flex space-x-2 dark:has-[:checked]:bg-gray-950 has-[:checked]:bg-gray-100 has-[:checked]:text-slate-900 dark:has-[:checked]:text-slate-100 hover:bg-gray-50 dark:hover:bg-gray-950 bg-white dark:bg-gray-900 dark:border dark:border-gray-600 text-margeblue dark:text-slate-200 outline-1 active:outline">
|
<button title="${option.name()}" onclick="document.location='$unsafe{option.getUrl()}'" class="flex-1 py-2 pl-2 rounded flex space-x-2 dark:has-[:checked]:bg-gray-950 has-[:checked]:bg-gray-300 has-[:checked]:text-slate-900 dark:has-[:checked]:text-slate-100 hover:bg-gray-50 dark:hover:bg-gray-950 bg-white dark:bg-gray-900 dark:border dark:border-gray-600 text-margeblue dark:text-slate-200 outline-1 active:outline">
|
||||||
@if (option.isSet())
|
@if (option.isSet())
|
||||||
<input type="checkbox" checked class="sr-only" aria-checked="true" />
|
<input type="checkbox" checked class="sr-only" aria-checked="true" />
|
||||||
@else
|
@else
|
||||||
|
@@ -15,7 +15,7 @@
|
|||||||
|
|
||||||
@template.part.head(title = "Marginalia Search", allowIndexing = true)
|
@template.part.head(title = "Marginalia Search", allowIndexing = true)
|
||||||
|
|
||||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||||
|
|
||||||
@template.part.navbar(navbar = navbar)
|
@template.part.navbar(navbar = navbar)
|
||||||
|
|
||||||
@@ -32,18 +32,14 @@
|
|||||||
|
|
||||||
@if (model.news().isEmpty())
|
@if (model.news().isEmpty())
|
||||||
<div class="max-w-7xl mx-auto flex flex-col space-y-4 fill-w">
|
<div class="max-w-7xl mx-auto flex flex-col space-y-4 fill-w">
|
||||||
<div class="border dark:border-gray-600 dark:bg-gray-800 bg-white rounded p-2 m-4 ">
|
<div class="border border-gray-300 border-gray-100 dark:border-gray-600 dark:bg-gray-800 bg-white rounded p-2 m-4 ">
|
||||||
<div class="text-slate-700 dark:text-white text-sm p-4">
|
<div class="text-slate-700 dark:text-white text-sm p-4">
|
||||||
<div class="fas fa-gift mr-1 text-margeblue dark:text-slate-200"></div>
|
The old version of Marginalia Search remains available
|
||||||
This is the new design and home of Marginalia Search.
|
<a href="https://old-search.marginalia.nu/" class="underline text-liteblue dark:text-blue-200">here</a>.
|
||||||
You can read about what this entails <a href="https://about.marginalia-search.com/article/redesign/" class="underline text-liteblue dark:text-blue-200">here</a>.
|
|
||||||
<p class="my-4"></p>
|
|
||||||
The old version of Marginalia Search remains available at
|
|
||||||
<a href="https://old-search.marginalia.nu/" class="underline text-liteblue dark:text-blue-200">https://old-search.marginalia.nu/</a>.
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="mx-auto flex flex-col sm:flex-row my-4 sm:space-x-2 space-y-2 sm:space-y-0 w-full md:w-auto px-2">
|
<div class="mx-auto flex flex-col sm:flex-row my-4 sm:space-x-2 space-y-2 sm:space-y-0 w-full md:w-auto px-2">
|
||||||
<div class="flex flex-col border dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3">
|
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3">
|
||||||
<div><i class="fas fa-sailboat mx-2 text-margeblue dark:text-slate-200"></i>Explore the Web</div>
|
<div><i class="fas fa-sailboat mx-2 text-margeblue dark:text-slate-200"></i>Explore the Web</div>
|
||||||
<ul class="list-disc ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
<ul class="list-disc ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
||||||
<li>Prioritizes non-commercial content</li>
|
<li>Prioritizes non-commercial content</li>
|
||||||
@@ -52,7 +48,7 @@
|
|||||||
</ul>
|
</ul>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="flex flex-col border dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3 ">
|
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3 ">
|
||||||
<div><i class="fas fa-hand-holding-hand mx-2 text-margeblue dark:text-slate-200"></i>Open Source</div>
|
<div><i class="fas fa-hand-holding-hand mx-2 text-margeblue dark:text-slate-200"></i>Open Source</div>
|
||||||
<ul class="list-disc ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
<ul class="list-disc ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
||||||
<li>Custom index and crawler software</li>
|
<li>Custom index and crawler software</li>
|
||||||
@@ -65,7 +61,7 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="flex flex-col border dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3 ">
|
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3 ">
|
||||||
<div><i class="fas fa-lock mx-2 text-margeblue dark:text-slate-200"></i> Privacy by default</div>
|
<div><i class="fas fa-lock mx-2 text-margeblue dark:text-slate-200"></i> Privacy by default</div>
|
||||||
<ul class="list-disc ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
<ul class="list-disc ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
||||||
<li>Filter out tracking and adtech</li>
|
<li>Filter out tracking and adtech</li>
|
||||||
|
@@ -13,7 +13,7 @@
|
|||||||
|
|
||||||
@template.part.head(title = "Marginalia Search - " + parameters.query())
|
@template.part.head(title = "Marginalia Search - " + parameters.query())
|
||||||
|
|
||||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||||
@template.part.navbar(navbar = navbar)
|
@template.part.navbar(navbar = navbar)
|
||||||
|
|
||||||
<div>
|
<div>
|
||||||
|
@@ -11,7 +11,7 @@
|
|||||||
|
|
||||||
@template.part.head(title = "Marginalia Search - " + model.domainA() + "/" + model.domainB())
|
@template.part.head(title = "Marginalia Search - " + model.domainA() + "/" + model.domainB())
|
||||||
|
|
||||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||||
|
|
||||||
@template.part.navbar(navbar = navbar)
|
@template.part.navbar(navbar = navbar)
|
||||||
|
|
||||||
|
@@ -9,7 +9,7 @@
|
|||||||
|
|
||||||
@template.part.head(title = "Marginalia Search - " + model.domain())
|
@template.part.head(title = "Marginalia Search - " + model.domain())
|
||||||
|
|
||||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||||
|
|
||||||
@template.part.navbar(navbar = navbar)
|
@template.part.navbar(navbar = navbar)
|
||||||
|
|
||||||
|
@@ -7,7 +7,7 @@
|
|||||||
|
|
||||||
@if (!list.isEmpty())
|
@if (!list.isEmpty())
|
||||||
|
|
||||||
<div class="bg-white dark:bg-gray-800 shadow-sm rounded overflow-hidden border dark:border-gray-600">
|
<div class="bg-white dark:bg-gray-800 shadow-sm rounded overflow-hidden border border-gray-300 dark:border-gray-600">
|
||||||
<div class="px-4 py-2 bg-margeblue text-white border-b border-gray-200 dark:border-gray-600 flex place-items-baseline">
|
<div class="px-4 py-2 bg-margeblue text-white border-b border-gray-200 dark:border-gray-600 flex place-items-baseline">
|
||||||
<h2 class="text-md">${title}</h2>
|
<h2 class="text-md">${title}</h2>
|
||||||
<div class="grow"></div>
|
<div class="grow"></div>
|
||||||
|
@@ -9,11 +9,11 @@
|
|||||||
|
|
||||||
@template.part.head(title = "Marginalia Search - Site Viewer")
|
@template.part.head(title = "Marginalia Search - Site Viewer")
|
||||||
|
|
||||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||||
|
|
||||||
@template.part.navbar(navbar = navbar)
|
@template.part.navbar(navbar = navbar)
|
||||||
|
|
||||||
<header class="border-b border-gray-300 dark:border-gray-600 bg-white dark:bg-gray-800 shadow-md">
|
<header class="border-b border-gray-300 border-gray-300 dark:border-gray-600 bg-white dark:bg-gray-800 shadow-md">
|
||||||
<div class="max-w-[1400px] mx-auto px-4 py-4">
|
<div class="max-w-[1400px] mx-auto px-4 py-4">
|
||||||
<h1 class="text-base md:text-xl mr-2 md:mr-8 font-serif">View Site Information</h1>
|
<h1 class="text-base md:text-xl mr-2 md:mr-8 font-serif">View Site Information</h1>
|
||||||
</div>
|
</div>
|
||||||
@@ -22,7 +22,7 @@
|
|||||||
<div class="max-w-[1000px] mx-auto flex gap-4 flex-col md:flex-row place-items-center md:place-items-start p-4">
|
<div class="max-w-[1000px] mx-auto flex gap-4 flex-col md:flex-row place-items-center md:place-items-start p-4">
|
||||||
|
|
||||||
|
|
||||||
<div class="border dark:border-gray-600 rounded md:my-4 overflow-hidden bg-white dark:bg-gray-800 flex flex-col space-y-2 flex-1">
|
<div class="border border-gray-300 dark:border-gray-600 rounded md:my-4 overflow-hidden bg-white dark:bg-gray-800 flex flex-col space-y-2 flex-1">
|
||||||
<div class="bg-margeblue text-white p-2 text-sm mb-2">View Site Information</div>
|
<div class="bg-margeblue text-white p-2 text-sm mb-2">View Site Information</div>
|
||||||
|
|
||||||
<p class="mx-4">This utility lets you explore what the search engine knows about the web,
|
<p class="mx-4">This utility lets you explore what the search engine knows about the web,
|
||||||
@@ -45,7 +45,7 @@
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
@if (!model.domains().isEmpty())
|
@if (!model.domains().isEmpty())
|
||||||
<div class="border dark:border-gray-600 rounded md:my-4 overflow-hidden w-full md:w-auto">
|
<div class="border border-gray-300 dark:border-gray-600 rounded md:my-4 overflow-hidden w-full md:w-auto">
|
||||||
<div class="bg-margeblue text-white p-2 text-sm">Recently Discovered Domains</div>
|
<div class="bg-margeblue text-white p-2 text-sm">Recently Discovered Domains</div>
|
||||||
|
|
||||||
|
|
||||||
|
@@ -8,17 +8,17 @@
|
|||||||
<div class="flex flex-col space-y-4 my-4 w-full">
|
<div class="flex flex-col space-y-4 my-4 w-full">
|
||||||
|
|
||||||
@if (backlinks.results().isEmpty())
|
@if (backlinks.results().isEmpty())
|
||||||
<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm ">
|
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm ">
|
||||||
The search engine isn't aware of any backlinks to ${backlinks.domain()}!
|
The search engine isn't aware of any backlinks to ${backlinks.domain()}!
|
||||||
</div>
|
</div>
|
||||||
@else
|
@else
|
||||||
<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
|
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
|
||||||
Showing documents linking to ${backlinks.domain()}
|
Showing documents linking to ${backlinks.domain()}
|
||||||
</div>
|
</div>
|
||||||
@endif
|
@endif
|
||||||
|
|
||||||
@for (GroupedUrlDetails group : backlinks.results())
|
@for (GroupedUrlDetails group : backlinks.results())
|
||||||
<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden mx-4">
|
<div class="border dark:border-gray-600 border-gray-300 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden mx-4">
|
||||||
<div class="flex space-x-2 flex-row place-items-baseline bg-margeblue text-white p-2 text-md">
|
<div class="flex space-x-2 flex-row place-items-baseline bg-margeblue text-white p-2 text-md">
|
||||||
<span class="fas fa-globe"></span>
|
<span class="fas fa-globe"></span>
|
||||||
<a href="/site/${group.domain().toString()}">${group.domain().toString()}</a>
|
<a href="/site/${group.domain().toString()}">${group.domain().toString()}</a>
|
||||||
|
@@ -9,17 +9,17 @@
|
|||||||
<div class="flex flex-col space-y-4 my-4">
|
<div class="flex flex-col space-y-4 my-4">
|
||||||
|
|
||||||
@if (docs.results().isEmpty())
|
@if (docs.results().isEmpty())
|
||||||
<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
|
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
|
||||||
The search engine doesn't index any documents from ${docs.domain()}
|
The search engine doesn't index any documents from ${docs.domain()}
|
||||||
</div>
|
</div>
|
||||||
@else
|
@else
|
||||||
<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
|
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
|
||||||
Showing documents from ${docs.domain()}
|
Showing documents from ${docs.domain()}
|
||||||
</div>
|
</div>
|
||||||
@endif
|
@endif
|
||||||
|
|
||||||
@for (UrlDetails details : docs.results())
|
@for (UrlDetails details : docs.results())
|
||||||
<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden mx-4">
|
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden mx-4">
|
||||||
<div class="flex grow justify-between items-start p-4">
|
<div class="flex grow justify-between items-start p-4">
|
||||||
<div class="flex-1">
|
<div class="flex-1">
|
||||||
<h2 class="text-xl text-gray-800 dark:text-white font-serif mr-4">
|
<h2 class="text-xl text-gray-800 dark:text-white font-serif mr-4">
|
||||||
|
@@ -8,9 +8,9 @@
|
|||||||
<!-- Main content -->
|
<!-- Main content -->
|
||||||
|
|
||||||
<div class="flex-1 p-4 space-y-4 mx-auto w-full md:w-auto">
|
<div class="flex-1 p-4 space-y-4 mx-auto w-full md:w-auto">
|
||||||
<div class="flex border dark:border-gray-600 rounded bg-white dark:bg-gray-800 flex-col space-y-4 pb-4 overflow-hidden md:max-w-lg" >
|
<div class="flex border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 flex-col space-y-4 pb-4 overflow-hidden md:max-w-lg" >
|
||||||
<div class="flex place-items-baseline space-x-2 p-2 text-md border-b dark:border-gray-600 bg-margeblue text-white">
|
<div class="flex place-items-center space-x-2 p-2 text-md border-b dark:border-gray-600 bg-margeblue text-white">
|
||||||
<i class="fa fa-globe"></i>
|
<img src="/site/${siteInfo.domain()}/favicon" style="width: 16px; height: 16px; vertical-align: center">
|
||||||
<span>${siteInfo.domain()}</span>
|
<span>${siteInfo.domain()}</span>
|
||||||
<div class="grow">
|
<div class="grow">
|
||||||
</div>
|
</div>
|
||||||
|
@@ -4,7 +4,7 @@
|
|||||||
@param ReportDomain reportDomain
|
@param ReportDomain reportDomain
|
||||||
|
|
||||||
<div class="flex-col mx-auto">
|
<div class="flex-col mx-auto">
|
||||||
<div class="max-w-2xl mx-auto bg-white dark:bg-gray-800 border dark:border-gray-600 rounded overflow-auto shadow-sm my-4 space-y-4 w-full">
|
<div class="max-w-2xl mx-auto bg-white dark:bg-gray-800 border border-gray-300 dark:border-gray-600 rounded overflow-auto shadow-sm my-4 space-y-4 w-full">
|
||||||
<div class="px-4 py-2 bg-margeblue text-white border-b border-gray-200 dark:border-gray-800">
|
<div class="px-4 py-2 bg-margeblue text-white border-b border-gray-200 dark:border-gray-800">
|
||||||
<h2 class="text-md">Report Domain Issue</h2>
|
<h2 class="text-md">Report Domain Issue</h2>
|
||||||
</div>
|
</div>
|
||||||
|
@@ -9,6 +9,15 @@ module.exports = {
|
|||||||
nicotine: '#f8f8ee',
|
nicotine: '#f8f8ee',
|
||||||
margeblue: '#3e5f6f',
|
margeblue: '#3e5f6f',
|
||||||
liteblue: '#0066cc',
|
liteblue: '#0066cc',
|
||||||
|
bgblue: '#e5e9eb',
|
||||||
|
},
|
||||||
|
screens: {
|
||||||
|
'coarsepointer': {
|
||||||
|
'raw': '(pointer: coarse)'
|
||||||
|
},
|
||||||
|
'finepointer': {
|
||||||
|
'raw': '(pointer: fine)'
|
||||||
|
},
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
screens: {
|
screens: {
|
||||||
|
@@ -23,7 +23,12 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
|
|||||||
apply from: "$rootProject.projectDir/docker.gradle"
|
apply from: "$rootProject.projectDir/docker.gradle"
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation project(':third-party:symspell')
|
|
||||||
|
|
||||||
|
implementation project(':code:common:db')
|
||||||
|
implementation project(':code:common:model')
|
||||||
|
implementation project(':code:common:service')
|
||||||
|
implementation project(':code:common:config')
|
||||||
|
|
||||||
implementation project(':code:functions:live-capture')
|
implementation project(':code:functions:live-capture')
|
||||||
implementation project(':code:functions:live-capture:api')
|
implementation project(':code:functions:live-capture:api')
|
||||||
@@ -32,20 +37,16 @@ dependencies {
|
|||||||
implementation project(':code:functions:domain-info')
|
implementation project(':code:functions:domain-info')
|
||||||
implementation project(':code:functions:domain-info:api')
|
implementation project(':code:functions:domain-info:api')
|
||||||
|
|
||||||
implementation project(':code:common:config')
|
|
||||||
implementation project(':code:common:service')
|
|
||||||
implementation project(':code:common:model')
|
|
||||||
implementation project(':code:common:db')
|
|
||||||
|
|
||||||
implementation project(':code:features-search:screenshots')
|
|
||||||
|
|
||||||
implementation project(':code:libraries:geo-ip')
|
implementation project(':code:libraries:geo-ip')
|
||||||
implementation project(':code:libraries:language-processing')
|
implementation project(':code:libraries:language-processing')
|
||||||
implementation project(':code:libraries:term-frequency-dict')
|
implementation project(':code:libraries:term-frequency-dict')
|
||||||
|
|
||||||
implementation libs.bundles.slf4j
|
implementation project(':third-party:symspell')
|
||||||
|
|
||||||
|
|
||||||
|
implementation libs.bundles.slf4j
|
||||||
implementation libs.prometheus
|
implementation libs.prometheus
|
||||||
|
implementation libs.commons.io
|
||||||
implementation libs.guava
|
implementation libs.guava
|
||||||
libs.bundles.grpc.get().each {
|
libs.bundles.grpc.get().each {
|
||||||
implementation dependencies.create(it) {
|
implementation dependencies.create(it) {
|
||||||
@@ -59,9 +60,7 @@ dependencies {
|
|||||||
implementation dependencies.create(libs.guice.get()) {
|
implementation dependencies.create(libs.guice.get()) {
|
||||||
exclude group: 'com.google.guava'
|
exclude group: 'com.google.guava'
|
||||||
}
|
}
|
||||||
implementation dependencies.create(libs.spark.get()) {
|
implementation libs.bundles.jooby
|
||||||
exclude group: 'org.eclipse.jetty'
|
|
||||||
}
|
|
||||||
implementation libs.bundles.jetty
|
implementation libs.bundles.jetty
|
||||||
implementation libs.opencsv
|
implementation libs.opencsv
|
||||||
implementation libs.trove
|
implementation libs.trove
|
||||||
|
@@ -3,6 +3,8 @@ package nu.marginalia.assistant;
|
|||||||
import com.google.inject.Guice;
|
import com.google.inject.Guice;
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Injector;
|
import com.google.inject.Injector;
|
||||||
|
import io.jooby.ExecutionMode;
|
||||||
|
import io.jooby.Jooby;
|
||||||
import nu.marginalia.livecapture.LivecaptureModule;
|
import nu.marginalia.livecapture.LivecaptureModule;
|
||||||
import nu.marginalia.service.MainClass;
|
import nu.marginalia.service.MainClass;
|
||||||
import nu.marginalia.service.ServiceId;
|
import nu.marginalia.service.ServiceId;
|
||||||
@@ -38,8 +40,17 @@ public class AssistantMain extends MainClass {
|
|||||||
var configuration = injector.getInstance(ServiceConfiguration.class);
|
var configuration = injector.getInstance(ServiceConfiguration.class);
|
||||||
orchestrateBoot(registry, configuration);
|
orchestrateBoot(registry, configuration);
|
||||||
|
|
||||||
injector.getInstance(AssistantMain.class);
|
var main = injector.getInstance(AssistantMain.class);
|
||||||
injector.getInstance(Initialization.class).setReady();
|
injector.getInstance(Initialization.class).setReady();
|
||||||
|
|
||||||
|
Jooby.runApp(new String[] { "application.env=prod" }, ExecutionMode.WORKER, () -> new Jooby() {
|
||||||
|
{
|
||||||
|
main.start(this);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
public void start(Jooby jooby) {
|
||||||
|
service.startJooby(jooby);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -2,27 +2,27 @@ package nu.marginalia.assistant;
|
|||||||
|
|
||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
|
import io.jooby.Context;
|
||||||
|
import io.jooby.Jooby;
|
||||||
import nu.marginalia.assistant.suggest.Suggestions;
|
import nu.marginalia.assistant.suggest.Suggestions;
|
||||||
import nu.marginalia.functions.domains.DomainInfoGrpcService;
|
import nu.marginalia.functions.domains.DomainInfoGrpcService;
|
||||||
import nu.marginalia.functions.math.MathGrpcService;
|
import nu.marginalia.functions.math.MathGrpcService;
|
||||||
import nu.marginalia.livecapture.LiveCaptureGrpcService;
|
import nu.marginalia.livecapture.LiveCaptureGrpcService;
|
||||||
import nu.marginalia.model.gson.GsonFactory;
|
import nu.marginalia.model.gson.GsonFactory;
|
||||||
import nu.marginalia.rss.svc.FeedsGrpcService;
|
import nu.marginalia.rss.svc.FeedsGrpcService;
|
||||||
import nu.marginalia.screenshot.ScreenshotService;
|
|
||||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||||
import nu.marginalia.service.server.BaseServiceParams;
|
import nu.marginalia.service.server.BaseServiceParams;
|
||||||
import nu.marginalia.service.server.SparkService;
|
import nu.marginalia.service.server.JoobyService;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import spark.Request;
|
|
||||||
import spark.Response;
|
|
||||||
import spark.Spark;
|
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
public class AssistantService extends SparkService {
|
public class AssistantService extends JoobyService {
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
private final Gson gson = GsonFactory.get();
|
private final Gson gson = GsonFactory.get();
|
||||||
|
@org.jetbrains.annotations.NotNull
|
||||||
|
private final ScreenshotService screenshotService;
|
||||||
private final Suggestions suggestions;
|
private final Suggestions suggestions;
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
@@ -39,30 +39,30 @@ public class AssistantService extends SparkService {
|
|||||||
List.of(domainInfoGrpcService,
|
List.of(domainInfoGrpcService,
|
||||||
mathGrpcService,
|
mathGrpcService,
|
||||||
liveCaptureGrpcService,
|
liveCaptureGrpcService,
|
||||||
feedsGrpcService));
|
feedsGrpcService),
|
||||||
|
List.of());
|
||||||
|
this.screenshotService = screenshotService;
|
||||||
|
|
||||||
this.suggestions = suggestions;
|
this.suggestions = suggestions;
|
||||||
|
|
||||||
Spark.staticFiles.expireTime(600);
|
|
||||||
|
|
||||||
Spark.get("/screenshot/:id", screenshotService::serveScreenshotRequest);
|
|
||||||
Spark.get("/suggest/", this::getSuggestions, this::convertToJson);
|
|
||||||
|
|
||||||
Spark.awaitInitialization();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private Object getSuggestions(Request request, Response response) {
|
public void startJooby(Jooby jooby) {
|
||||||
response.type("application/json");
|
super.startJooby(jooby);
|
||||||
var param = request.queryParams("partial");
|
|
||||||
if (param == null) {
|
jooby.get("/suggest/", this::getSuggestions);
|
||||||
|
jooby.get("/screenshot/{id}", screenshotService::serveScreenshotRequest);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getSuggestions(Context context) {
|
||||||
|
context.setResponseType("application/json");
|
||||||
|
var param = context.query("partial");
|
||||||
|
if (param.isMissing()) {
|
||||||
logger.warn("Bad parameter, partial is null");
|
logger.warn("Bad parameter, partial is null");
|
||||||
Spark.halt(500);
|
context.setResponseCode(500);
|
||||||
|
return "{}";
|
||||||
}
|
}
|
||||||
return suggestions.getSuggestions(10, param);
|
return gson.toJson(suggestions.getSuggestions(10, param.value()));
|
||||||
}
|
|
||||||
|
|
||||||
private String convertToJson(Object o) {
|
|
||||||
return gson.toJson(o);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@@ -0,0 +1,118 @@
|
|||||||
|
package nu.marginalia.assistant;
|
||||||
|
|
||||||
|
import com.google.common.base.Strings;
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import io.jooby.Context;
|
||||||
|
import nu.marginalia.db.DbDomainQueries;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.sql.SQLException;
|
||||||
|
|
||||||
|
public class ScreenshotService {
|
||||||
|
|
||||||
|
private final DbDomainQueries domainQueries;
|
||||||
|
private final HikariDataSource dataSource;
|
||||||
|
|
||||||
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public ScreenshotService(DbDomainQueries dbDomainQueries, HikariDataSource dataSource) {
|
||||||
|
this.domainQueries = dbDomainQueries;
|
||||||
|
this.dataSource = dataSource;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasScreenshot(int domainId) {
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var ps = conn.prepareStatement("""
|
||||||
|
SELECT TRUE
|
||||||
|
FROM DATA_DOMAIN_SCREENSHOT
|
||||||
|
INNER JOIN EC_DOMAIN ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME
|
||||||
|
WHERE EC_DOMAIN.ID=?
|
||||||
|
""")) {
|
||||||
|
ps.setInt(1, domainId);
|
||||||
|
var rs = ps.executeQuery();
|
||||||
|
if (rs.next()) {
|
||||||
|
return rs.getBoolean(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
logger.warn("SQL error", ex);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Object serveScreenshotRequest(Context context) {
|
||||||
|
if (Strings.isNullOrEmpty(context.path("id").value(""))) {
|
||||||
|
context.setResponseCode(404);
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
int id = context.path("id").intValue();
|
||||||
|
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var ps = conn.prepareStatement("""
|
||||||
|
SELECT CONTENT_TYPE, DATA
|
||||||
|
FROM DATA_DOMAIN_SCREENSHOT
|
||||||
|
INNER JOIN EC_DOMAIN ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME
|
||||||
|
WHERE EC_DOMAIN.ID=?
|
||||||
|
""")) {
|
||||||
|
ps.setInt(1, id);
|
||||||
|
var rsp = ps.executeQuery();
|
||||||
|
if (rsp.next()) {
|
||||||
|
context.setResponseType(rsp.getString(1));
|
||||||
|
context.setResponseCode(200);
|
||||||
|
context.setResponseHeader("Cache-control", "public,max-age=3600");
|
||||||
|
|
||||||
|
try (var rs = context.responseStream()) {
|
||||||
|
IOUtils.copy(rsp.getBlob(2).getBinaryStream(), rs);
|
||||||
|
}
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (IOException ex) {
|
||||||
|
logger.warn("IO error", ex);
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
logger.warn("SQL error", ex);
|
||||||
|
}
|
||||||
|
|
||||||
|
context.setResponseType("image/svg+xml");
|
||||||
|
|
||||||
|
var name = domainQueries.getDomain(id).map(Object::toString)
|
||||||
|
.orElse("[Screenshot Not Yet Captured]");
|
||||||
|
|
||||||
|
return """
|
||||||
|
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||||
|
<svg
|
||||||
|
xmlns="http://www.w3.org/2000/svg"
|
||||||
|
width="640px"
|
||||||
|
height="480px"
|
||||||
|
viewBox="0 0 640 480"
|
||||||
|
version="1.1">
|
||||||
|
<g>
|
||||||
|
<rect
|
||||||
|
style="fill:#808080"
|
||||||
|
id="rect288"
|
||||||
|
width="595.41992"
|
||||||
|
height="430.01825"
|
||||||
|
x="23.034981"
|
||||||
|
y="27.850344" />
|
||||||
|
<text
|
||||||
|
xml:space="preserve"
|
||||||
|
style="font-size:100px;fill:#909090;font-family:sans-serif;"
|
||||||
|
x="20"
|
||||||
|
y="120">Placeholder</text>
|
||||||
|
<text
|
||||||
|
xml:space="preserve"
|
||||||
|
style="font-size:32px;fill:#000000;font-family:monospace;"
|
||||||
|
x="320" y="240" dominant-baseline="middle" text-anchor="middle">%s</text>
|
||||||
|
</g>
|
||||||
|
</svg>
|
||||||
|
""".formatted(name);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@@ -42,6 +42,8 @@ dependencies {
|
|||||||
implementation project(':code:libraries:message-queue')
|
implementation project(':code:libraries:message-queue')
|
||||||
|
|
||||||
implementation project(':code:functions:link-graph:api')
|
implementation project(':code:functions:link-graph:api')
|
||||||
|
implementation project(':code:functions:favicon')
|
||||||
|
implementation project(':code:functions:favicon:api')
|
||||||
|
|
||||||
implementation project(':code:processes:crawling-process:model')
|
implementation project(':code:processes:crawling-process:model')
|
||||||
implementation project(':code:processes:crawling-process:model')
|
implementation project(':code:processes:crawling-process:model')
|
||||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.executor;
|
|||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import nu.marginalia.execution.*;
|
import nu.marginalia.execution.*;
|
||||||
|
import nu.marginalia.functions.favicon.FaviconGrpcService;
|
||||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||||
import nu.marginalia.service.server.BaseServiceParams;
|
import nu.marginalia.service.server.BaseServiceParams;
|
||||||
import nu.marginalia.service.server.SparkService;
|
import nu.marginalia.service.server.SparkService;
|
||||||
@@ -24,6 +25,7 @@ public class ExecutorSvc extends SparkService {
|
|||||||
ExecutorCrawlGrpcService executorCrawlGrpcService,
|
ExecutorCrawlGrpcService executorCrawlGrpcService,
|
||||||
ExecutorSideloadGrpcService executorSideloadGrpcService,
|
ExecutorSideloadGrpcService executorSideloadGrpcService,
|
||||||
ExecutorExportGrpcService executorExportGrpcService,
|
ExecutorExportGrpcService executorExportGrpcService,
|
||||||
|
FaviconGrpcService faviconGrpcService,
|
||||||
ExecutionInit executionInit,
|
ExecutionInit executionInit,
|
||||||
ExecutorFileTransferService fileTransferService) throws Exception {
|
ExecutorFileTransferService fileTransferService) throws Exception {
|
||||||
super(params,
|
super(params,
|
||||||
@@ -31,7 +33,8 @@ public class ExecutorSvc extends SparkService {
|
|||||||
List.of(executorGrpcService,
|
List.of(executorGrpcService,
|
||||||
executorCrawlGrpcService,
|
executorCrawlGrpcService,
|
||||||
executorSideloadGrpcService,
|
executorSideloadGrpcService,
|
||||||
executorExportGrpcService)
|
executorExportGrpcService,
|
||||||
|
faviconGrpcService)
|
||||||
);
|
);
|
||||||
|
|
||||||
this.executionInit = executionInit;
|
this.executionInit = executionInit;
|
||||||
|
@@ -16,8 +16,6 @@ platforms, but for lack of suitable hardware, this can not be guaranteed.
|
|||||||
The civilized way of installing this is to use [SDKMAN](https://sdkman.io/);
|
The civilized way of installing this is to use [SDKMAN](https://sdkman.io/);
|
||||||
graalce is a good distribution choice but it doesn't matter too much.
|
graalce is a good distribution choice but it doesn't matter too much.
|
||||||
|
|
||||||
**Tailwindcss** - Install NPM and run `npm install tailwindcss @tailwindcss/cli`
|
|
||||||
|
|
||||||
## Quick Set up
|
## Quick Set up
|
||||||
|
|
||||||
[https://docs.marginalia.nu/](https://docs.marginalia.nu/) has a more comprehensive guide for the install
|
[https://docs.marginalia.nu/](https://docs.marginalia.nu/) has a more comprehensive guide for the install
|
||||||
|
@@ -74,3 +74,7 @@ download_model model/tfreq-new-algo3.bin https://huggingface.co/MarginaliaNu/Mar
|
|||||||
download_model model/lid.176.ftz https://huggingface.co/MarginaliaNu/MarginaliaModelData/resolve/c9339e4224f1dfad7f628809c32687e748198ae3/lid.176.ftz?download=true 340156704bb8c8e50c4abf35a7ec2569
|
download_model model/lid.176.ftz https://huggingface.co/MarginaliaNu/MarginaliaModelData/resolve/c9339e4224f1dfad7f628809c32687e748198ae3/lid.176.ftz?download=true 340156704bb8c8e50c4abf35a7ec2569
|
||||||
|
|
||||||
popd
|
popd
|
||||||
|
|
||||||
|
pushd $(dirname $0)/..
|
||||||
|
npm install -D tailwindcss@3
|
||||||
|
popd
|
||||||
|
@@ -16,7 +16,8 @@ include 'code:services-application:status-service'
|
|||||||
|
|
||||||
include 'code:functions:math'
|
include 'code:functions:math'
|
||||||
include 'code:functions:math:api'
|
include 'code:functions:math:api'
|
||||||
|
include 'code:functions:favicon'
|
||||||
|
include 'code:functions:favicon:api'
|
||||||
include 'code:functions:domain-info'
|
include 'code:functions:domain-info'
|
||||||
include 'code:functions:domain-info:api'
|
include 'code:functions:domain-info:api'
|
||||||
|
|
||||||
@@ -160,12 +161,12 @@ dependencyResolutionManagement {
|
|||||||
library('prometheus-server', 'io.prometheus', 'simpleclient_httpserver').version('0.16.0')
|
library('prometheus-server', 'io.prometheus', 'simpleclient_httpserver').version('0.16.0')
|
||||||
library('prometheus-hotspot', 'io.prometheus', 'simpleclient_hotspot').version('0.16.0')
|
library('prometheus-hotspot', 'io.prometheus', 'simpleclient_hotspot').version('0.16.0')
|
||||||
|
|
||||||
library('slf4j.api', 'org.slf4j', 'slf4j-api').version('1.7.36')
|
library('slf4j.api', 'org.slf4j', 'slf4j-api').version('2.0.3')
|
||||||
library('slf4j.jdk14', 'org.slf4j', 'slf4j-jdk14').version('2.0.3')
|
library('slf4j.jdk14', 'org.slf4j', 'slf4j-jdk14').version('2.0.3')
|
||||||
|
|
||||||
library('log4j.api', 'org.apache.logging.log4j', 'log4j-api').version('2.17.2')
|
library('log4j.api', 'org.apache.logging.log4j', 'log4j-api').version('2.24.3')
|
||||||
library('log4j.core', 'org.apache.logging.log4j', 'log4j-core').version('2.17.2')
|
library('log4j.core', 'org.apache.logging.log4j', 'log4j-core').version('2.24.3')
|
||||||
library('log4j.slf4j', 'org.apache.logging.log4j', 'log4j-slf4j-impl').version('2.17.2')
|
library('log4j.slf4j', 'org.apache.logging.log4j', 'log4j-slf4j2-impl').version('2.24.3')
|
||||||
|
|
||||||
library('notnull','org.jetbrains','annotations').version('24.0.0')
|
library('notnull','org.jetbrains','annotations').version('24.0.0')
|
||||||
|
|
||||||
@@ -239,6 +240,7 @@ dependencyResolutionManagement {
|
|||||||
library('jooby-jte','io.jooby','jooby-jte').version(joobyVersion)
|
library('jooby-jte','io.jooby','jooby-jte').version(joobyVersion)
|
||||||
library('jooby-apt','io.jooby','jooby-apt').version(joobyVersion)
|
library('jooby-apt','io.jooby','jooby-apt').version(joobyVersion)
|
||||||
|
|
||||||
|
library('wiremock', 'org.wiremock','wiremock').version('3.11.0')
|
||||||
library('jte','gg.jte','jte').version('3.1.15')
|
library('jte','gg.jte','jte').version('3.1.15')
|
||||||
|
|
||||||
bundle('jetty', ['jetty-server', 'jetty-util', 'jetty-servlet'])
|
bundle('jetty', ['jetty-server', 'jetty-util', 'jetty-servlet'])
|
||||||
|
Reference in New Issue
Block a user