Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-10-05 21:22:39 +02:00
Compare commits
62 Commits
deploy-006 ... deploy-010
626cab5fab
cfd4712191
9f18ced73d
18e91269ab
e315ca5758
3ceea17c1d
b34527c1a3
185bf28fca
78cc25584a
62ba30bacf
3bb84eb206
be7d13ccce
8c088a7c0b
ea9a642b9b
27f528af6a
20ca41ec95
7671f0d9e4
44d6bc71b7
9d302e2973
f553701224
f076d05595
b513809710
7519b28e21
3eac4dd57f
4c2810720a
8480ba8daa
fbba392491
530eb35949
c2dd2175a2
b8581b0f56
2ea34767d8
e9af838231
ae0cad47c4
5fbc8ef998
32c6dd9e6a
6ece6a6cfb
39cd1c18f8
eb65daaa88
0bebdb6e33
1e50e392c6
fb673de370
eee73ab16c
5354e034bf
72384ad6ca
a2b076f9be
c8b0a32c0f
f0d74aa3bb
74a1f100f4
eb049658e4
db138b2a6f
1673fc284c
503ea57d5b
18ca926c7f
db99242db2
2b9d2985ba
eeb6ecd711
1f58aeadbf
3d68be64da
668f3b16ef
98a340a0d1
8862100f7e
274941f6de
@@ -5,7 +5,7 @@ plugins {
     // This is a workaround for a bug in the Jib plugin that causes it to stall randomly
     // https://github.com/GoogleContainerTools/jib/issues/3347
-    id 'com.google.cloud.tools.jib' version '3.4.3' apply(false)
+    id 'com.google.cloud.tools.jib' version '3.4.4' apply(false)
 }

 group 'marginalia'
@@ -24,58 +24,4 @@ public class LanguageModels {
         this.fasttextLanguageModel = fasttextLanguageModel;
         this.segments = segments;
     }
-
-    public static LanguageModelsBuilder builder() {
-        return new LanguageModelsBuilder();
-    }
-
-    public static class LanguageModelsBuilder {
-        private Path termFrequencies;
-        private Path openNLPSentenceDetectionData;
-        private Path posRules;
-        private Path posDict;
-        private Path fasttextLanguageModel;
-        private Path segments;
-
-        LanguageModelsBuilder() {
-        }
-
-        public LanguageModelsBuilder termFrequencies(Path termFrequencies) {
-            this.termFrequencies = termFrequencies;
-            return this;
-        }
-
-        public LanguageModelsBuilder openNLPSentenceDetectionData(Path openNLPSentenceDetectionData) {
-            this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
-            return this;
-        }
-
-        public LanguageModelsBuilder posRules(Path posRules) {
-            this.posRules = posRules;
-            return this;
-        }
-
-        public LanguageModelsBuilder posDict(Path posDict) {
-            this.posDict = posDict;
-            return this;
-        }
-
-        public LanguageModelsBuilder fasttextLanguageModel(Path fasttextLanguageModel) {
-            this.fasttextLanguageModel = fasttextLanguageModel;
-            return this;
-        }
-
-        public LanguageModelsBuilder segments(Path segments) {
-            this.segments = segments;
-            return this;
-        }
-
-        public LanguageModels build() {
-            return new LanguageModels(this.termFrequencies, this.openNLPSentenceDetectionData, this.posRules, this.posDict, this.fasttextLanguageModel, this.segments);
-        }
-
-        public String toString() {
-            return "LanguageModels.LanguageModelsBuilder(termFrequencies=" + this.termFrequencies + ", openNLPSentenceDetectionData=" + this.openNLPSentenceDetectionData + ", posRules=" + this.posRules + ", posDict=" + this.posDict + ", fasttextLanguageModel=" + this.fasttextLanguageModel + ", segments=" + this.segments + ")";
-        }
-    }
 }
@@ -22,6 +22,7 @@ public class DbDomainQueries {
     private static final Logger logger = LoggerFactory.getLogger(DbDomainQueries.class);

     private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
+    private final Cache<EdgeDomain, DomainIdWithNode> domainWithNodeCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
     private final Cache<Integer, EdgeDomain> domainNameCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
     private final Cache<String, List<DomainWithNode>> siblingsCache = CacheBuilder.newBuilder().maximumSize(10_000).build();

@@ -59,6 +60,34 @@ public class DbDomainQueries {
         }
     }

+    public DomainIdWithNode getDomainIdWithNode(EdgeDomain domain) throws NoSuchElementException {
+        try {
+            return domainWithNodeCache.get(domain, () -> {
+                try (var connection = dataSource.getConnection();
+                     var stmt = connection.prepareStatement("SELECT ID, NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
+
+                    stmt.setString(1, domain.toString());
+                    var rsp = stmt.executeQuery();
+                    if (rsp.next()) {
+                        return new DomainIdWithNode(rsp.getInt(1), rsp.getInt(2));
+                    }
+                }
+                catch (SQLException ex) {
+                    throw new RuntimeException(ex);
+                }
+
+                throw new NoSuchElementException();
+            });
+        }
+        catch (UncheckedExecutionException ex) {
+            throw new NoSuchElementException();
+        }
+        catch (ExecutionException ex) {
+            throw new RuntimeException(ex.getCause());
+        }
+    }
+
     public OptionalInt tryGetDomainId(EdgeDomain domain) {

         Integer maybeId = domainIdCache.getIfPresent(domain);
@@ -145,4 +174,6 @@ public class DbDomainQueries {
         return nodeAffinity > 0;
     }
     }

+    public record DomainIdWithNode (int domainId, int nodeAffinity) { }
 }
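For orientation, a minimal caller sketch for the new cached lookup; the domain name is hypothetical and `queries` stands for an injected DbDomainQueries instance:

    // Hypothetical usage of the new lookup; "marginalia.nu" is an example domain
    try {
        DbDomainQueries.DomainIdWithNode dn = queries.getDomainIdWithNode(new EdgeDomain("marginalia.nu"));
        System.out.println("id=" + dn.domainId() + ", node=" + dn.nodeAffinity());
    }
    catch (NoSuchElementException e) {
        // thrown when the domain is absent from EC_DOMAIN
    }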
@@ -10,7 +10,9 @@ import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.time.LocalDateTime;
-import java.util.*;
+import java.util.HashSet;
+import java.util.Optional;
+import java.util.Set;
 import java.util.function.Function;

 /** WorkLog is a journal of work done by a process,
@@ -61,6 +63,12 @@ public class WorkLog implements AutoCloseable, Closeable {
         return new WorkLoadIterable<>(logFile, mapper);
     }

+    public static int countEntries(Path crawlerLog) throws IOException {
+        try (var linesStream = Files.lines(crawlerLog)) {
+            return (int) linesStream.filter(WorkLogEntry::isJobId).count();
+        }
+    }
+
     // Use synchro over concurrent set to avoid competing writes
     // - correct is better than fast here, it's sketchy enough to use
     // a PrintWriter
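A short sketch of the new helper in use, with a hypothetical log path; it counts only job-id lines, so the result matches the number of work entries rather than raw lines:

    // Hypothetical usage; the path is an example
    Path crawlerLog = Path.of("/storage/crawl-data/crawler.log");
    int totalEntries = WorkLog.countEntries(crawlerLog);
    System.out.println("entries: " + totalEntries);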
@@ -6,6 +6,7 @@ import nu.marginalia.service.ServiceId;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import java.io.IOException;
 import java.net.InetAddress;
 import java.net.NetworkInterface;
 import java.util.Enumeration;
@@ -115,11 +116,12 @@ public class ServiceConfigurationModule extends AbstractModule {
         }
     }

-    public static String getLocalNetworkIP() throws Exception {
+    public static String getLocalNetworkIP() throws IOException {
         Enumeration<NetworkInterface> nets = NetworkInterface.getNetworkInterfaces();

         while (nets.hasMoreElements()) {
             NetworkInterface netif = nets.nextElement();
+            logger.info("Considering network interface {}: Up? {}, Loopback? {}", netif.getDisplayName(), netif.isUp(), netif.isLoopback());
             if (!netif.isUp() || netif.isLoopback()) {
                 continue;
             }
@@ -127,6 +129,7 @@ public class ServiceConfigurationModule extends AbstractModule {
             Enumeration<InetAddress> inetAddresses = netif.getInetAddresses();
             while (inetAddresses.hasMoreElements()) {
                 InetAddress addr = inetAddresses.nextElement();
+                logger.info("Considering address {}: SiteLocal? {}, Loopback? {}", addr.getHostAddress(), addr.isSiteLocalAddress(), addr.isLoopbackAddress());
                 if (addr.isSiteLocalAddress() && !addr.isLoopbackAddress()) {
                     return addr.getHostAddress();
                 }
@@ -15,6 +15,7 @@ import org.slf4j.LoggerFactory;
 import org.slf4j.Marker;
 import org.slf4j.MarkerFactory;

+import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.List;
@@ -106,9 +107,12 @@ public class JoobyService {
                 config.externalAddress());

         // FIXME: This won't work outside of docker, may need to submit a PR to jooby to allow classpaths here
-        jooby.install(new JteModule(Path.of("/app/resources/jte"), Path.of("/app/classes/jte-precompiled")));
-        jooby.assets("/*", Paths.get("/app/resources/static"));
-
+        if (Files.exists(Path.of("/app/resources/jte")) || Files.exists(Path.of("/app/classes/jte-precompiled"))) {
+            jooby.install(new JteModule(Path.of("/app/resources/jte"), Path.of("/app/classes/jte-precompiled")));
+        }
+        if (Files.exists(Path.of("/app/resources/static"))) {
+            jooby.assets("/*", Paths.get("/app/resources/static"));
+        }
         var options = new ServerOptions();
         options.setHost(config.bindAddress());
         options.setPort(restEndpoint.port());
@@ -6,25 +6,36 @@ import nu.marginalia.service.module.ServiceConfiguration;
 import org.eclipse.jetty.server.Server;
 import org.eclipse.jetty.servlet.ServletContextHandler;
 import org.eclipse.jetty.servlet.ServletHolder;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

 import java.net.InetSocketAddress;

 public class MetricsServer {

+    private static final Logger logger = LoggerFactory.getLogger(MetricsServer.class);
+
     @Inject
-    public MetricsServer(ServiceConfiguration configuration) throws Exception {
+    public MetricsServer(ServiceConfiguration configuration) {
         // If less than zero, we forego setting up a metrics server
         if (configuration.metricsPort() < 0)
             return;

-        Server server = new Server(new InetSocketAddress(configuration.bindAddress(), configuration.metricsPort()));
+        try {
+            Server server = new Server(new InetSocketAddress(configuration.bindAddress(), configuration.metricsPort()));

-        ServletContextHandler context = new ServletContextHandler();
-        context.setContextPath("/");
-        server.setHandler(context);
+            ServletContextHandler context = new ServletContextHandler();
+            context.setContextPath("/");
+            server.setHandler(context);

-        context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
+            context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");

-        server.start();
+            logger.info("MetricsServer listening on {}:{}", configuration.bindAddress(), configuration.metricsPort());
+
+            server.start();
+        }
+        catch (Exception|NoSuchMethodError ex) {
+            logger.error("Failed to set up metrics server", ex);
+        }
     }
 }
@@ -14,6 +14,8 @@ import nu.marginalia.mq.persistence.MqPersistence;
 import nu.marginalia.nodecfg.NodeConfigurationService;
 import nu.marginalia.nodecfg.model.NodeProfile;
 import nu.marginalia.service.module.ServiceConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

 import java.time.Duration;
 import java.time.LocalDateTime;
@@ -29,6 +31,7 @@ public class UpdateRssActor extends RecordActorPrototype {

     private final NodeConfigurationService nodeConfigurationService;
     private final MqPersistence persistence;
+    private static final Logger logger = LoggerFactory.getLogger(UpdateRssActor.class);

     @Inject
     public UpdateRssActor(Gson gson,
@@ -101,8 +104,8 @@ public class UpdateRssActor extends RecordActorPrototype {
             case UpdateRefresh(int count, long msgId) -> {
                 MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
                 if (msg == null) {
-                    // Retry the update
-                    yield new Error("Failed to update feeds: message not found");
+                    logger.warn("UpdateRefresh is taking a very long time");
+                    yield new UpdateRefresh(count, msgId);
                 } else if (msg.state() != MqMessageState.OK) {
                     // Retry the update
                     yield new Error("Failed to update feeds: " + msg.state());
@@ -119,8 +122,8 @@ public class UpdateRssActor extends RecordActorPrototype {
             case UpdateClean(long msgId) -> {
                 MqMessage msg = persistence.waitForMessageTerminalState(msgId, Duration.ofSeconds(10), Duration.ofHours(12));
                 if (msg == null) {
-                    // Retry the update
-                    yield new Error("Failed to update feeds: message not found");
+                    logger.warn("UpdateClean is taking a very long time");
+                    yield new UpdateClean(msgId);
                 } else if (msg.state() != MqMessageState.OK) {
                     // Retry the update
                     yield new Error("Failed to update feeds: " + msg.state());
@@ -8,6 +8,7 @@ import nu.marginalia.actor.state.ActorStep;
 import nu.marginalia.io.CrawlerOutputFile;
 import nu.marginalia.process.log.WorkLog;
 import nu.marginalia.process.log.WorkLogEntry;
+import nu.marginalia.service.control.ServiceHeartbeat;
 import nu.marginalia.slop.SlopCrawlDataRecord;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorage;
@@ -18,6 +19,7 @@ import org.slf4j.LoggerFactory;

 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
 import java.util.Map;
 import java.util.Optional;
 import java.util.function.Function;
@@ -26,14 +28,15 @@ import java.util.function.Function;
 public class MigrateCrawlDataActor extends RecordActorPrototype {

     private final FileStorageService fileStorageService;
+    private final ServiceHeartbeat serviceHeartbeat;
     private static final Logger logger = LoggerFactory.getLogger(MigrateCrawlDataActor.class);

     @Inject
-    public MigrateCrawlDataActor(Gson gson, FileStorageService fileStorageService) {
+    public MigrateCrawlDataActor(Gson gson, FileStorageService fileStorageService, ServiceHeartbeat serviceHeartbeat) {
         super(gson);

         this.fileStorageService = fileStorageService;
+        this.serviceHeartbeat = serviceHeartbeat;
     }

     public record Run(long fileStorageId) implements ActorStep {}
@@ -49,33 +52,50 @@ public class MigrateCrawlDataActor extends RecordActorPrototype {
                 Path crawlerLog = root.resolve("crawler.log");
                 Path newCrawlerLog = Files.createTempFile(root, "crawler", ".migrate.log");

-                try (WorkLog workLog = new WorkLog(newCrawlerLog)) {
+                int totalEntries = WorkLog.countEntries(crawlerLog);
+
+                try (WorkLog workLog = new WorkLog(newCrawlerLog);
+                     var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Migrating")
+                ) {
+                    int entryIdx = 0;
+
                     for (Map.Entry<WorkLogEntry, Path> item : WorkLog.iterableMap(crawlerLog, new CrawlDataLocator(root))) {

-                        var entry = item.getKey();
-                        var path = item.getValue();
+                        final WorkLogEntry entry = item.getKey();
+                        final Path inputPath = item.getValue();

-                        logger.info("Converting {}", entry.id());
+                        Path outputPath = inputPath;
+                        heartbeat.progress("Migrating " + inputPath.getFileName(), entryIdx++, totalEntries);

-                        if (path.toFile().getName().endsWith(".parquet")) {
+                        if (inputPath.toString().endsWith(".parquet")) {
                             String domain = entry.id();
                             String id = Integer.toHexString(domain.hashCode());

-                            Path outputFile = CrawlerOutputFile.createSlopPath(root, id, domain);
+                            outputPath = CrawlerOutputFile.createSlopPath(root, id, domain);

-                            SlopCrawlDataRecord.convertFromParquet(path, outputFile);
+                            if (Files.exists(inputPath)) {
+                                try {
+                                    SlopCrawlDataRecord.convertFromParquet(inputPath, outputPath);
+                                    Files.deleteIfExists(inputPath);
+                                } catch (Exception ex) {
+                                    outputPath = inputPath; // don't update the work log on error
+                                    logger.error("Failed to convert " + inputPath, ex);
+                                }
+                            }
+                            else if (!Files.exists(inputPath) && !Files.exists(outputPath)) {
+                                // if the input file is missing, and the output file is missing, we just write the log
+                                // record identical to the old one
+                                outputPath = inputPath;
+                            }
+                        }

-                            workLog.setJobToFinished(entry.id(), outputFile.toString(), entry.cnt());
-                        }
-                        else {
-                            workLog.setJobToFinished(entry.id(), path.toString(), entry.cnt());
-                        }
+                        // Write a log entry for the (possibly) converted file
+                        workLog.setJobToFinished(entry.id(), outputPath.toString(), entry.cnt());
                     }
                 }

                 Path oldCrawlerLog = Files.createTempFile(root, "crawler-", ".migrate.old.log");
-                Files.move(crawlerLog, oldCrawlerLog);
+                Files.move(crawlerLog, oldCrawlerLog, StandardCopyOption.REPLACE_EXISTING);
                 Files.move(newCrawlerLog, crawlerLog);

                 yield new End();
code/functions/favicon/api/build.gradle (new file, 47 lines)
@@ -0,0 +1,47 @@
+plugins {
+    id 'java'
+
+    id "com.google.protobuf" version "0.9.4"
+    id 'jvm-test-suite'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
+    }
+}
+
+jar.archiveBaseName = 'favicon-api'
+
+apply from: "$rootProject.projectDir/protobuf.gradle"
+apply from: "$rootProject.projectDir/srcsets.gradle"
+
+dependencies {
+    implementation project(':code:common:model')
+    implementation project(':code:common:config')
+    implementation project(':code:common:service')
+
+    implementation libs.bundles.slf4j
+
+    implementation libs.prometheus
+    implementation libs.notnull
+    implementation libs.guava
+    implementation dependencies.create(libs.guice.get()) {
+        exclude group: 'com.google.guava'
+    }
+    implementation libs.gson
+    implementation libs.bundles.protobuf
+    implementation libs.guava
+    libs.bundles.grpc.get().each {
+        implementation dependencies.create(it) {
+            exclude group: 'com.google.guava'
+        }
+    }
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+}
@@ -0,0 +1,39 @@
+package nu.marginalia.api.favicon;
+
+import com.google.inject.Inject;
+import nu.marginalia.service.client.GrpcChannelPoolFactory;
+import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
+import nu.marginalia.service.discovery.property.ServiceKey;
+import nu.marginalia.service.discovery.property.ServicePartition;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Optional;
+
+public class FaviconClient {
+    private static final Logger logger = LoggerFactory.getLogger(FaviconClient.class);
+
+    private final GrpcMultiNodeChannelPool<FaviconAPIGrpc.FaviconAPIBlockingStub> channelPool;
+
+    @Inject
+    public FaviconClient(GrpcChannelPoolFactory factory) {
+        this.channelPool = factory.createMulti(
+                ServiceKey.forGrpcApi(FaviconAPIGrpc.class, ServicePartition.multi()),
+                FaviconAPIGrpc::newBlockingStub);
+    }
+
+    public record FaviconData(byte[] bytes, String contentType) {}
+
+    public Optional<FaviconData> getFavicon(String domain, int node) {
+        RpcFaviconResponse rsp = channelPool.call(FaviconAPIGrpc.FaviconAPIBlockingStub::getFavicon)
+                .forNode(node)
+                .run(RpcFaviconRequest.newBuilder().setDomain(domain).build());
+
+        if (rsp.getData().isEmpty())
+            return Optional.empty();
+
+        return Optional.of(new FaviconData(rsp.getData().toByteArray(), rsp.getContentType()));
+    }
+}
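A minimal usage sketch for the new client, assuming Guice injection; the domain and node number are hypothetical examples:

    // Hypothetical usage; faviconClient is @Inject-ed
    Optional<FaviconClient.FaviconData> icon = faviconClient.getFavicon("marginalia.nu", 1);
    icon.ifPresent(data -> System.out.println(data.contentType() + ": " + data.bytes().length + " bytes"));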
code/functions/favicon/api/src/main/protobuf/favicon.proto (new file, 20 lines)
@@ -0,0 +1,20 @@
+syntax="proto3";
+package marginalia.api.favicon;
+
+option java_package="nu.marginalia.api.favicon";
+option java_multiple_files=true;
+
+service FaviconAPI {
+    /** Fetches information about a domain. */
+    rpc getFavicon(RpcFaviconRequest) returns (RpcFaviconResponse) {}
+}
+
+message RpcFaviconRequest {
+    string domain = 1;
+}
+
+message RpcFaviconResponse {
+    string domain = 1;
+    bytes data = 2;
+    string contentType = 3;
+}
code/functions/favicon/build.gradle (new file, 49 lines)
@@ -0,0 +1,49 @@
+plugins {
+    id 'java'
+
+    id 'application'
+    id 'jvm-test-suite'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
+    }
+}
+
+apply from: "$rootProject.projectDir/srcsets.gradle"
+
+dependencies {
+    implementation project(':code:common:config')
+    implementation project(':code:common:service')
+    implementation project(':code:common:model')
+    implementation project(':code:common:db')
+    implementation project(':code:functions:favicon:api')
+    implementation project(':code:processes:crawling-process')
+
+    implementation libs.bundles.slf4j
+
+    implementation libs.prometheus
+    implementation libs.guava
+    libs.bundles.grpc.get().each {
+        implementation dependencies.create(it) {
+            exclude group: 'com.google.guava'
+        }
+    }
+
+    implementation libs.notnull
+    implementation libs.guava
+    implementation dependencies.create(libs.guice.get()) {
+        exclude group: 'com.google.guava'
+    }
+    implementation dependencies.create(libs.spark.get()) {
+        exclude group: 'org.eclipse.jetty'
+    }
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+}
@@ -0,0 +1,44 @@
+package nu.marginalia.functions.favicon;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import com.google.protobuf.ByteString;
+import io.grpc.stub.StreamObserver;
+import nu.marginalia.api.favicon.FaviconAPIGrpc;
+import nu.marginalia.api.favicon.RpcFaviconRequest;
+import nu.marginalia.api.favicon.RpcFaviconResponse;
+import nu.marginalia.crawl.DomainStateDb;
+import nu.marginalia.service.server.DiscoverableService;
+
+import java.util.Optional;
+
+@Singleton
+public class FaviconGrpcService extends FaviconAPIGrpc.FaviconAPIImplBase implements DiscoverableService {
+    private final DomainStateDb domainStateDb;
+
+    @Inject
+    public FaviconGrpcService(DomainStateDb domainStateDb) {
+        this.domainStateDb = domainStateDb;
+    }
+
+    @Override
+    public void getFavicon(RpcFaviconRequest request, StreamObserver<RpcFaviconResponse> responseObserver) {
+        Optional<DomainStateDb.FaviconRecord> icon = domainStateDb.getIcon(request.getDomain());
+
+        RpcFaviconResponse response;
+        if (icon.isEmpty()) {
+            response = RpcFaviconResponse.newBuilder().build();
+        }
+        else {
+            var iconRecord = icon.get();
+            response = RpcFaviconResponse.newBuilder()
+                    .setContentType(iconRecord.contentType())
+                    .setDomain(request.getDomain())
+                    .setData(ByteString.copyFrom(iconRecord.imageData()))
+                    .build();
+        }
+
+        responseObserver.onNext(response);
+        responseObserver.onCompleted();
+    }
+}
@@ -34,6 +34,7 @@ dependencies {
     implementation libs.bundles.slf4j
     implementation libs.commons.lang3
     implementation libs.commons.io
+    implementation libs.wiremock

     implementation libs.prometheus
     implementation libs.guava
@@ -1,6 +1,7 @@
 package nu.marginalia.livecapture;

 import com.google.gson.Gson;
+import nu.marginalia.WmsaHome;
 import nu.marginalia.model.gson.GsonFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -12,6 +13,7 @@ import java.net.http.HttpRequest;
 import java.net.http.HttpResponse;
 import java.time.Duration;
 import java.util.Map;
+import java.util.Optional;

 /** Client for local browserless.io API */
 public class BrowserlessClient implements AutoCloseable {
@@ -27,13 +29,16 @@ public class BrowserlessClient implements AutoCloseable {
     private final URI browserlessURI;
     private final Gson gson = GsonFactory.get();

+    private final String userAgent = WmsaHome.getUserAgent().uaString();
+
     public BrowserlessClient(URI browserlessURI) {
         this.browserlessURI = browserlessURI;
     }

-    public String content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
+    public Optional<String> content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
         Map<String, Object> requestData = Map.of(
                 "url", url,
+                "userAgent", userAgent,
                 "gotoOptions", gotoOptions
         );
@@ -49,10 +54,10 @@ public class BrowserlessClient implements AutoCloseable {

         if (rsp.statusCode() >= 300) {
             logger.info("Failed to fetch content for {}, status {}", url, rsp.statusCode());
-            return null;
+            return Optional.empty();
         }

-        return rsp.body();
+        return Optional.of(rsp.body());
     }

     public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions)
@@ -60,6 +65,7 @@ public class BrowserlessClient implements AutoCloseable {
         Map<String, Object> requestData = Map.of(
                 "url", url,
+                "userAgent", userAgent,
                 "options", screenshotOptions,
                 "gotoOptions", gotoOptions
         );
@@ -84,7 +90,7 @@ public class BrowserlessClient implements AutoCloseable {
     }

     @Override
-    public void close() throws Exception {
+    public void close() {
         httpClient.shutdownNow();
     }
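With content() now returning Optional&lt;String&gt;, callers no longer need a null check. A short sketch, assuming a browserless endpoint on a hypothetical local port:

    // Hypothetical endpoint; in the tests below the URI comes from a testcontainer
    try (var client = new BrowserlessClient(URI.create("http://localhost:3000"))) {
        Optional<String> html = client.content("https://www.marginalia.nu/",
                BrowserlessClient.GotoOptions.defaultValues());
        html.ifPresentOrElse(
                body -> System.out.println("fetched " + body.length() + " chars"),
                () -> System.out.println("fetch failed or returned an error status"));
    }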
@@ -1,5 +1,9 @@
 package nu.marginalia.livecapture;

+import com.github.tomakehurst.wiremock.WireMockServer;
+import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
+import nu.marginalia.WmsaHome;
+import nu.marginalia.service.module.ServiceConfigurationModule;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Tag;
@@ -8,34 +12,86 @@ import org.testcontainers.containers.GenericContainer;
 import org.testcontainers.junit.jupiter.Testcontainers;
 import org.testcontainers.utility.DockerImageName;

+import java.io.IOException;
 import java.net.URI;
 import java.util.Map;

+import static com.github.tomakehurst.wiremock.client.WireMock.*;
+
 @Testcontainers
 @Tag("slow")
 public class BrowserlessClientTest {
     static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome"))
+            .withEnv(Map.of("TOKEN", "BROWSERLESS_TOKEN"))
+            .withNetworkMode("bridge")
             .withExposedPorts(3000);

+    static WireMockServer wireMockServer =
+            new WireMockServer(WireMockConfiguration.wireMockConfig()
+                    .port(18089));
+
+    static String localIp;
+
+    static URI browserlessURI;
+
     @BeforeAll
-    public static void setup() {
+    public static void setup() throws IOException {
         container.start();
+
+        browserlessURI = URI.create(String.format("http://%s:%d/",
+                container.getHost(),
+                container.getMappedPort(3000))
+        );
+
+        wireMockServer.start();
+        wireMockServer.stubFor(get("/").willReturn(aResponse().withStatus(200).withBody("Ok")));
+
+        localIp = ServiceConfigurationModule.getLocalNetworkIP();
     }

+    @Tag("flaky")
+    @Test
+    public void testInspectContentUA__Flaky() throws Exception {
+        try (var client = new BrowserlessClient(browserlessURI)) {
+            client.content("http://" + localIp + ":18089/",
+                    BrowserlessClient.GotoOptions.defaultValues()
+            );
+        }
+
+        wireMockServer.verify(getRequestedFor(urlEqualTo("/")).withHeader("User-Agent", equalTo(WmsaHome.getUserAgent().uaString())));
+    }
+
+    @Tag("flaky")
+    @Test
+    public void testInspectScreenshotUA__Flaky() throws Exception {
+        try (var client = new BrowserlessClient(browserlessURI)) {
+            client.screenshot("http://" + localIp + ":18089/",
+                    BrowserlessClient.GotoOptions.defaultValues(),
+                    BrowserlessClient.ScreenshotOptions.defaultValues()
+            );
+        }
+
+        wireMockServer.verify(getRequestedFor(urlEqualTo("/")).withHeader("User-Agent", equalTo(WmsaHome.getUserAgent().uaString())));
+    }
+
     @Test
     public void testContent() throws Exception {
-        try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) {
-            var content = client.content("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues());
-            Assertions.assertNotNull(content, "Content should not be null");
+        try (var client = new BrowserlessClient(browserlessURI)) {
+            var content = client.content("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()).orElseThrow();
+
+            Assertions.assertFalse(content.isBlank(), "Content should not be empty");
         }
     }

     @Test
     public void testScreenshot() throws Exception {
-        try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) {
-            var screenshot = client.screenshot("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues(), BrowserlessClient.ScreenshotOptions.defaultValues());
+        try (var client = new BrowserlessClient(browserlessURI)) {
+            var screenshot = client.screenshot("https://www.marginalia.nu/",
+                    BrowserlessClient.GotoOptions.defaultValues(),
+                    BrowserlessClient.ScreenshotOptions.defaultValues());

             Assertions.assertNotNull(screenshot, "Screenshot should not be null");
         }
     }
@@ -134,6 +134,10 @@ public class QueryExpansion {
             if (scoreCombo > scoreA + scoreB || scoreCombo > 1000) {
                 graph.addVariantForSpan(prev, qw, joinedWord);
             }
+            else if (StringUtils.isAlpha(prev.word()) && StringUtils.isNumeric(qw.word())) { // join e.g. trs 80 to trs80 and trs-80
+                graph.addVariantForSpan(prev, qw, prev.word() + qw.word());
+                graph.addVariantForSpan(prev, qw, prev.word() + "-" + qw.word());
+            }
         }

         prev = qw;
@@ -213,6 +213,18 @@ public class QueryFactoryTest {
         System.out.println(subquery);
     }

+    @Test
+    public void testContractionWordNum() {
+        var subquery = parseAndGetSpecs("glove 80");
+
+        Assertions.assertTrue(subquery.query.compiledQuery.contains(" glove "));
+        Assertions.assertTrue(subquery.query.compiledQuery.contains(" 80 "));
+        Assertions.assertTrue(subquery.query.compiledQuery.contains(" glove-80 "));
+        Assertions.assertTrue(subquery.query.compiledQuery.contains(" glove80 "));
+    }
+
     @Test
     public void testCplusPlus() {
         var subquery = parseAndGetSpecs("std::vector::push_back vector");
@@ -155,8 +155,15 @@ public class SentenceExtractor {
     public List<DocumentSentence> extractSentencesFromString(String text, EnumSet<HtmlTag> htmlTags) {
         String[] sentences;

-        // Normalize spaces
+        // Safety net against malformed data DOS attacks,
+        // found 5+ MB <p>-tags in the wild that just break
+        // the sentence extractor causing it to stall forever.
+        if (text.length() > 50_000) {
+            // 50k chars can hold a small novel, let alone single html tags
+            text = text.substring(0, 50_000);
+        }

+        // Normalize spaces
         text = normalizeSpaces(text);

         // Split into sentences
@@ -5,9 +5,7 @@ import nu.marginalia.actor.state.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
+import java.util.*;

 public abstract class RecordActorPrototype implements ActorPrototype {

@@ -118,7 +116,7 @@ public abstract class RecordActorPrototype implements ActorPrototype {
     }

     private String functionName(Class<? extends ActorStep> functionClass) {
-        return functionClass.getSimpleName().toUpperCase();
+        return ActorStep.functionName(functionClass);
     }

     private ActorStep constructState(String message) throws ReflectiveOperationException {
@@ -145,4 +143,43 @@ public abstract class RecordActorPrototype implements ActorPrototype {
         }
     }

+    /** Get a list of JSON prototypes for each actor step declared by this actor */
+    @SuppressWarnings("unchecked")
+    public Map<String, String> getMessagePrototypes() {
+        Map<String, String> messagePrototypes = new HashMap<>();
+
+        for (var clazz : getClass().getDeclaredClasses()) {
+            if (!clazz.isRecord() || !ActorStep.class.isAssignableFrom(clazz))
+                continue;
+
+            StringJoiner sj = new StringJoiner(",\n\t", "{\n\t", "\n}");
+
+            renderToJsonPrototype(sj, (Class<? extends Record>) clazz);
+
+            messagePrototypes.put(ActorStep.functionName((Class<? extends ActorStep>) clazz), sj.toString());
+        }
+
+        return messagePrototypes;
+    }
+
+    @SuppressWarnings("unchecked")
+    private void renderToJsonPrototype(StringJoiner sj, Class<? extends Record> recordType) {
+        for (var field : recordType.getDeclaredFields()) {
+            String typeName = field.getType().getSimpleName();
+
+            if ("List".equals(typeName)) {
+                sj.add(String.format("\"%s\": [ ]", field.getName()));
+            }
+            else if (field.getType().isRecord()) {
+                var innerSj = new StringJoiner(",", "{", "}");
+                renderToJsonPrototype(innerSj, (Class<? extends Record>) field.getType());
+                sj.add(String.format("\"%s\": %s", field.getName(), innerSj));
+            }
+            else {
+                sj.add(String.format("\"%s\": \"%s\"", field.getName(), typeName));
+            }
+        }
+    }
+
 }
@@ -1,3 +1,7 @@
 package nu.marginalia.actor.state;

-public interface ActorStep {}
+public interface ActorStep {
+    static String functionName(Class<? extends ActorStep> type) {
+        return type.getSimpleName().toUpperCase();
+    }
+}
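For illustration, a sketch of what the relocated helper returns, using a hypothetical step record (Run here is an example, not part of this diff):

    import nu.marginalia.actor.state.ActorStep;

    class FunctionNameDemo {
        // Hypothetical step record, only for illustration
        record Run(long fileStorageId) implements ActorStep {}

        public static void main(String[] args) {
            // functionName upper-cases the record's simple name, so this prints "RUN"
            System.out.println(ActorStep.functionName(Run.class));
        }
    }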
@@ -12,6 +12,7 @@ import nu.marginalia.converting.sideload.SideloadSourceFactory;
 import nu.marginalia.converting.writer.ConverterBatchWritableIf;
 import nu.marginalia.converting.writer.ConverterBatchWriter;
 import nu.marginalia.converting.writer.ConverterWriter;
+import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.mq.MessageQueueFactory;
 import nu.marginalia.mqapi.converting.ConvertRequest;
 import nu.marginalia.process.ProcessConfiguration;
@@ -34,6 +35,7 @@ import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
+import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
 import java.util.Optional;
@@ -49,6 +51,7 @@ public class ConverterMain extends ProcessMainClass {
     private final ProcessHeartbeat heartbeat;
     private final FileStorageService fileStorageService;
     private final SideloadSourceFactory sideloadSourceFactory;
+    private static final int SIDELOAD_THRESHOLD = Integer.getInteger("converter.sideloadThreshold", 10_000);

     public static void main(String... args) throws Exception {

@@ -199,12 +202,26 @@ public class ConverterMain extends ProcessMainClass {
             processedDomains.set(batchingWorkLog.size());
             heartbeat.setProgress(processedDomains.get() / (double) totalDomains);

-            for (var domain : WorkLog.iterableMap(crawlDir.getLogFile(),
+            logger.info("Processing small items");
+
+            // We separate the large and small domains to reduce the number of critical sections,
+            // as the large domains have a separate processing track that doesn't store everything
+            // in memory
+            final List<Path> bigTasks = new ArrayList<>();
+
+            // First process the small items
+            for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(),
                     new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog)))
             {
+                if (SerializableCrawlDataStream.getSizeHint(dataPath) >= SIDELOAD_THRESHOLD) {
+                    bigTasks.add(dataPath);
+                    continue;
+                }
+
                 pool.submit(() -> {
                     try {
-                        ConverterBatchWritableIf writable = processor.createWritable(domain);
-                        converterWriter.accept(writable);
+                        try (var dataStream = SerializableCrawlDataStream.openDataStream(dataPath)) {
+                            ConverterBatchWritableIf writable = processor.fullProcessing(dataStream);
+                            converterWriter.accept(writable);
+                        }
                     }
                     catch (Exception ex) {
@@ -223,6 +240,35 @@ public class ConverterMain extends ProcessMainClass {
             do {
                 System.out.println("Waiting for pool to terminate... " + pool.getActiveCount() + " remaining");
             } while (!pool.awaitTermination(60, TimeUnit.SECONDS));

+            logger.info("Processing large items");
+
+            try (var hb = heartbeat.createAdHocTaskHeartbeat("Large Domains")) {
+                int bigTaskIdx = 0;
+                // Next the big items domain-by-domain
+                for (var dataPath : bigTasks) {
+                    hb.progress(dataPath.toFile().getName(), bigTaskIdx++, bigTasks.size());
+
+                    try {
+                        // SerializableCrawlDataStream is autocloseable, we can't try-with-resources because then it will be
+                        // closed before it's consumed by the converterWriter. Instead, the converterWriter guarantees it
+                        // will close it after it's consumed.
+                        var stream = SerializableCrawlDataStream.openDataStream(dataPath);
+                        ConverterBatchWritableIf writable = processor.simpleProcessing(stream, SerializableCrawlDataStream.getSizeHint(dataPath));
+
+                        converterWriter.accept(writable);
+                    }
+                    catch (Exception ex) {
+                        logger.info("Error in processing", ex);
+                    }
+                    finally {
+                        heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains);
+                    }
+                }
+            }
+
+            logger.info("Processing complete");
@@ -14,7 +14,6 @@ import nu.marginalia.converting.writer.ConverterBatchWritableIf;
 import nu.marginalia.converting.writer.ConverterBatchWriter;
 import nu.marginalia.geoip.GeoIpDictionary;
 import nu.marginalia.geoip.sources.AsnTable;
-import nu.marginalia.io.CrawledDomainReader;
 import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.crawl.DomainIndexingState;
@@ -28,13 +27,11 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.IOException;
-import java.nio.file.Path;
 import java.sql.SQLException;
 import java.util.*;
 import java.util.regex.Pattern;

 public class DomainProcessor {
-    private static final int SIDELOAD_THRESHOLD = Integer.getInteger("converter.sideloadThreshold", 10_000);
     private final DocumentProcessor documentProcessor;
     private final SiteWords siteWords;
     private final AnchorTagsSource anchorTagsSource;
@@ -56,21 +53,6 @@ public class DomainProcessor {
         geoIpDictionary.waitReady();
     }

-    public ConverterBatchWritableIf createWritable(Path path) throws IOException {
-
-        var dataStream = CrawledDomainReader.createDataStream(path);
-
-        final int sizeHint = dataStream.sizeHint();
-
-        if (sizeHint > SIDELOAD_THRESHOLD) {
-            // If the file is too big, we run a processing mode that doesn't
-            // require loading the entire dataset into RAM
-            return simpleProcessing(dataStream, sizeHint);
-        }
-
-        return fullProcessing(dataStream);
-    }
-
     public SimpleProcessing simpleProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection<String> extraKeywords) {
         try {
             return new SimpleProcessing(dataStream, sizeHint, extraKeywords);
@@ -159,6 +141,7 @@ public class DomainProcessor {
         private final Set<String> processedUrls = new HashSet<>();
         private final DomainLinks externalDomainLinks;
+        private final LshDocumentDeduplicator deduplicator = new LshDocumentDeduplicator();

         private static final ProcessingIterator.Factory iteratorFactory = ProcessingIterator.factory(8,
                 Integer.getInteger("java.util.concurrent.ForkJoinPool.common.parallelism", Runtime.getRuntime().availableProcessors())
         );
@@ -194,6 +177,7 @@ public class DomainProcessor {
         @Override
         public Iterator<ProcessedDocument> getDocumentsStream() {
             return iteratorFactory.create((taskConsumer) -> {
+
                 while (dataStream.hasNext())
                 {
                     if (!(dataStream.next() instanceof CrawledDocument doc))
@@ -116,7 +116,7 @@ public class AdblockSimulator {

     // Refrain from cleaning up this code, it's very hot code and needs to be fast.
-    // This version is about 100x faster than the a "clean" first stab implementation.
+    // This version is about 100x faster than a "clean" first stab implementation.

     class RuleVisitor implements NodeFilter {
         public boolean sawAds;
@@ -23,7 +23,7 @@ public class DocumentGeneratorExtractor {

         var tags = doc.select("meta[name=generator]");

-        if (tags.size() == 0) {
+        if (tags.isEmpty()) {
             // Some sites have a comment in the head instead of a meta tag
             return fingerprintServerTech(doc, responseHeaders);
         }
@@ -127,7 +127,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoCloseable {
         }
         fullHtml.append("</div></body></html>");

-        var doc = sideloaderProcessing
+        return sideloaderProcessing
                 .processDocument(fullUrl,
                         fullHtml.toString(),
                         List.of("encyclopedia", "wiki"),
@@ -137,8 +137,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoCloseable {
                         anchorTextKeywords.getAnchorTextKeywords(domainLinks, new EdgeUrl(fullUrl)),
                         LocalDate.now().getYear(),
                         10_000_000);
-
-        return doc;
     }

     private String normalizeUtf8(String url) {
@@ -39,6 +39,9 @@ public class ConverterWriter implements AutoCloseable {
         workerThread.start();
     }

+    /** Queue and eventually write the domain into the converter journal
+     * The domain object will be closed after it's processed.
+     * */
     public void accept(@Nullable ConverterBatchWritableIf domain) {
         if (null == domain)
             return;
@@ -72,15 +75,15 @@ public class ConverterWriter implements AutoCloseable {

                 if (workLog.isItemCommitted(id) || workLog.isItemInCurrentBatch(id)) {
                     logger.warn("Skipping already logged item {}", id);
+                }
+                else {
+                    currentWriter.write(data);
+                    workLog.logItem(id);
                     data.close();
                     continue;
                 }

-                currentWriter.write(data);
-
-                workLog.logItem(id);
-
                 switcher.tick();
                 data.close();
             }
         }
         catch (Exception ex) {
@@ -11,7 +11,6 @@ import nu.marginalia.slop.column.primitive.IntColumn;
 import nu.marginalia.slop.column.primitive.LongColumn;
 import nu.marginalia.slop.column.string.EnumColumn;
 import nu.marginalia.slop.column.string.StringColumn;
-import nu.marginalia.slop.column.string.TxtStringColumn;
 import nu.marginalia.slop.desc.StorageType;
 import org.jetbrains.annotations.Nullable;

@@ -182,8 +181,8 @@ public record SlopDocumentRecord(
     }

     // Basic information
-    private static final TxtStringColumn domainsColumn = new TxtStringColumn("domain", StandardCharsets.UTF_8, StorageType.GZIP);
-    private static final TxtStringColumn urlsColumn = new TxtStringColumn("url", StandardCharsets.UTF_8, StorageType.GZIP);
+    private static final StringColumn domainsColumn = new StringColumn("domain", StandardCharsets.UTF_8, StorageType.GZIP);
+    private static final StringColumn urlsColumn = new StringColumn("url", StandardCharsets.UTF_8, StorageType.GZIP);
     private static final VarintColumn ordinalsColumn = new VarintColumn("ordinal", StorageType.PLAIN);
     private static final EnumColumn statesColumn = new EnumColumn("state", StandardCharsets.US_ASCII, StorageType.PLAIN);
     private static final StringColumn stateReasonsColumn = new StringColumn("stateReason", StandardCharsets.US_ASCII, StorageType.GZIP);
@@ -211,7 +210,7 @@ public record SlopDocumentRecord(
     private static final VarintCodedSequenceArrayColumn spansColumn = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD);

     public static class KeywordsProjectionReader extends SlopTable {
-        private final TxtStringColumn.Reader domainsReader;
+        private final StringColumn.Reader domainsReader;
         private final VarintColumn.Reader ordinalsReader;
         private final IntColumn.Reader htmlFeaturesReader;
         private final LongColumn.Reader domainMetadataReader;
@@ -275,8 +274,8 @@ public record SlopDocumentRecord(
     }

     public static class MetadataReader extends SlopTable {
-        private final TxtStringColumn.Reader domainsReader;
-        private final TxtStringColumn.Reader urlsReader;
+        private final StringColumn.Reader domainsReader;
+        private final StringColumn.Reader urlsReader;
         private final VarintColumn.Reader ordinalsReader;
         private final StringColumn.Reader titlesReader;
         private final StringColumn.Reader descriptionsReader;
@@ -332,8 +331,8 @@ public record SlopDocumentRecord(
     }

     public static class Writer extends SlopTable {
-        private final TxtStringColumn.Writer domainsWriter;
-        private final TxtStringColumn.Writer urlsWriter;
+        private final StringColumn.Writer domainsWriter;
+        private final StringColumn.Writer urlsWriter;
         private final VarintColumn.Writer ordinalsWriter;
         private final EnumColumn.Writer statesWriter;
         private final StringColumn.Writer stateReasonsWriter;
@@ -26,7 +26,7 @@ public class DocumentBodyToString {
         return new String(data, charset);
     }

-    public static Document getParsedData(ContentType type, byte[] data, String url) throws IOException {
+    public static Document getParsedData(ContentType type, byte[] data, int maxLength, String url) throws IOException {
         final Charset charset;

         if (type.charset() == null || type.charset().isBlank()) {
@@ -35,7 +35,7 @@ public class DocumentBodyToString {
             charset = charsetMap.computeIfAbsent(type, DocumentBodyToString::computeCharset);
         }

-        ByteArrayInputStream bais = new ByteArrayInputStream(data);
+        ByteArrayInputStream bais = new ByteArrayInputStream(data, 0, Math.min(data.length, maxLength));

        return Jsoup.parse(bais, charset.name(), url);
     }
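A caller sketch for the widened signature; the 1 MB cap, URL, and variable names are hypothetical values:

    // Hypothetical call; parsing is now capped at maxLength bytes of body data
    Document doc = DocumentBodyToString.getParsedData(
            contentType, bodyBytes, 1_000_000, "https://example.com/");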
@@ -19,7 +19,6 @@ import nu.marginalia.crawl.retreival.DomainProber;
|
||||
import nu.marginalia.crawl.warc.WarcArchiverFactory;
|
||||
import nu.marginalia.crawl.warc.WarcArchiverIf;
|
||||
import nu.marginalia.db.DomainBlacklist;
|
||||
import nu.marginalia.io.CrawledDomainReader;
|
||||
import nu.marginalia.io.CrawlerOutputFile;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.mq.MessageQueueFactory;
|
||||
@@ -28,13 +27,11 @@ import nu.marginalia.process.ProcessConfigurationModule;
|
||||
import nu.marginalia.process.ProcessMainClass;
|
||||
import nu.marginalia.process.control.ProcessHeartbeatImpl;
|
||||
import nu.marginalia.process.log.WorkLog;
|
||||
import nu.marginalia.process.log.WorkLogEntry;
|
||||
import nu.marginalia.service.module.DatabaseModule;
|
||||
import nu.marginalia.slop.SlopCrawlDataRecord;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||
import org.apache.logging.log4j.util.Strings;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@@ -44,11 +41,13 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.security.Security;
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.function.Function;
|
||||
|
||||
import static nu.marginalia.mqapi.ProcessInboxNames.CRAWLER_INBOX;
|
||||
|
||||
@@ -182,8 +181,6 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
// Assign any domains with node_affinity=0 to this node, and then fetch all domains assigned to this node
|
||||
// to be crawled.
|
||||
|
||||
performMigration(outputDir);
|
||||
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
try (var assignFreeDomains = conn.prepareStatement(
|
||||
"""
|
||||
@@ -251,9 +248,14 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
// (this happens when the process is restarted after a crash or a shutdown)
|
||||
tasksDone.set(workLog.countFinishedJobs());
|
||||
|
||||
// Create crawl tasks and submit them to the pool for execution
|
||||
// List of deferred tasks used to ensure beneficial scheduling of domains with regard to DomainLocks,
|
||||
// merely shuffling the domains tends to lead to a lot of threads being blocked waiting for a semphore,
|
||||
// this will more aggressively attempt to schedule the jobs to avoid blocking
|
||||
List<CrawlTask> taskList = new ArrayList<>();
|
||||
|
||||
// Create crawl tasks
|
||||
for (CrawlSpecRecord crawlSpec : crawlSpecRecords) {
|
||||
if (workLog.isJobFinished(crawlSpec.domain()))
|
||||
if (workLog.isJobFinished(crawlSpec.domain))
|
||||
continue;
|
||||
|
||||
var task = new CrawlTask(
|
||||
@@ -264,11 +266,22 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
domainStateDb,
|
||||
workLog);
|
||||
|
||||
if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) == null) {
|
||||
pool.submitQuietly(task);
|
||||
// Try to run immediately, to avoid unnecessarily keeping the entire work set in RAM
|
||||
if (!trySubmitDeferredTask(task)) {
|
||||
// Otherwise add to the taskList for deferred execution
|
||||
taskList.add(task);
|
||||
}
|
||||
}
|
||||
|
||||
// Schedule viable tasks for execution until list is empty
|
||||
while (!taskList.isEmpty()) {
|
||||
taskList.removeIf(this::trySubmitDeferredTask);
|
||||
|
||||
// Add a small pause here to avoid busy looping toward the end of the execution cycle when
|
||||
// we might have no new viable tasks to run for hours on end
|
||||
TimeUnit.MILLISECONDS.sleep(50);
|
||||
}
|
||||
|
||||
logger.info("Shutting down the pool, waiting for tasks to complete...");
|
||||
|
||||
pool.shutDown();
|
||||
@@ -293,6 +306,28 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
}
|
||||
}
|
||||
|
||||
/** Submit a task for execution if it can be run, returns true if it was submitted
|
||||
* or if it can be discarded */
|
||||
private boolean trySubmitDeferredTask(CrawlTask task) {
|
||||
if (!task.canRun()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null) {
|
||||
return true; // task has already run, duplicate in crawl specs
|
||||
}
|
||||
|
||||
try {
|
||||
// This blocks the caller when the pool is full
|
||||
pool.submitQuietly(task);
|
||||
return true;
|
||||
}
|
||||
catch (RuntimeException ex) {
|
||||
logger.error("Failed to submit task " + task.domain, ex);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public void runForSingleDomain(String targetDomainName, FileStorageId fileStorageId) throws Exception {
|
||||
runForSingleDomain(targetDomainName, fileStorageService.getStorage(fileStorageId).asPath());
|
||||
}
|
||||
@@ -349,9 +384,20 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
this.id = Integer.toHexString(domain.hashCode());
|
||||
}
|
||||
|
||||
/** Best effort indicator whether we could start this now without getting stuck in
|
||||
* DomainLocks purgatory */
|
||||
public boolean canRun() {
|
||||
return domainLocks.canLock(new EdgeDomain(domain));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run() throws Exception {
|
||||
|
||||
if (workLog.isJobFinished(domain)) { // No-Op
|
||||
logger.info("Omitting task {}, as it is already run", domain);
|
||||
return;
|
||||
}
|
||||
|
||||
Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
|
||||
Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
|
||||
Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
|
||||
@@ -406,7 +452,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
logger.error("Error fetching domain " + domain, e);
|
||||
}
|
||||
finally {
|
||||
// We don't need to double-count these; it's also kept int he workLog
|
||||
// We don't need to double-count these; it's also kept in the workLog
|
||||
pendingCrawlTasks.remove(domain);
|
||||
Thread.currentThread().setName("[idle]");
|
||||
|
||||
@@ -417,11 +463,22 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
|
||||
private CrawlDataReference getReference() {
|
||||
try {
|
||||
return new CrawlDataReference(CrawledDomainReader.createDataStream(outputDir, domain, id));
|
||||
Path slopPath = CrawlerOutputFile.getSlopPath(outputDir, id, domain);
|
||||
if (Files.exists(slopPath)) {
|
||||
return new CrawlDataReference(slopPath);
|
||||
}
|
||||
|
||||
Path parquetPath = CrawlerOutputFile.getParquetPath(outputDir, id, domain);
|
||||
if (Files.exists(parquetPath)) {
|
||||
slopPath = migrateParquetData(parquetPath, domain, outputDir);
|
||||
return new CrawlDataReference(slopPath);
|
||||
}
|
||||
|
||||
} catch (IOException e) {
|
||||
logger.debug("Failed to read previous crawl data for {}", specification.domain());
|
||||
return new CrawlDataReference();
|
||||
}
|
||||
|
||||
return new CrawlDataReference();
|
||||
}
|
||||
|
||||
}
|
||||
@@ -482,92 +539,19 @@ public class CrawlerMain extends ProcessMainClass {
        }
    }

    // Data migration logic

    private void performMigration(Path root) throws IOException {
        Path crawlerLog = root.resolve("crawler.log");
        Path newCrawlerLog = Files.createTempFile(root, "crawler", ".migrate.log");

        int finishedTasks = 0;
        int totalTasks;
        try (var oldLog = new WorkLog(crawlerLog)) {
            totalTasks = oldLog.countFinishedJobs();
    // Migrate from parquet to slop if necessary
    //
    // This must be synchronized as chewing through parquet files in parallel leads to enormous memory overhead
    private synchronized Path migrateParquetData(Path inputPath, String domain, Path crawlDataRoot) throws IOException {
        if (!inputPath.toString().endsWith(".parquet")) {
            return inputPath;
        }

        try (WorkLog workLog = new WorkLog(newCrawlerLog);
             var migrationHeartbeat = heartbeat.createAdHocTaskHeartbeat("MIGRATING")) {
        Path outputFile = CrawlerOutputFile.createSlopPath(crawlDataRoot, Integer.toHexString(domain.hashCode()), domain);

        SlopCrawlDataRecord.convertFromParquet(inputPath, outputFile);

            for (Map.Entry<WorkLogEntry, Path> item : WorkLog.iterableMap(crawlerLog, new CrawlDataLocator(root))) {

                var entry = item.getKey();
                var path = item.getValue();

                if (path.toFile().getName().endsWith(".parquet")) {
                    logger.info("Converting {}", entry.id());

                    String domain = entry.id();
                    String id = Integer.toHexString(domain.hashCode());

                    Path outputFile = CrawlerOutputFile.createSlopPath(root, id, domain);

                    SlopCrawlDataRecord.convertFromParquet(path, outputFile);

                    workLog.setJobToFinished(entry.id(), outputFile.toString(), entry.cnt());
                }
                else {
                    workLog.setJobToFinished(entry.id(), path.toString(), entry.cnt());
                }

                migrationHeartbeat.progress("Parquet To Slop", ++finishedTasks, totalTasks);
            }
        }

        Path oldCrawlerLog = Files.createTempFile(root, "crawler-", ".migrate.old.log");
        Files.move(crawlerLog, oldCrawlerLog, StandardCopyOption.REPLACE_EXISTING);
        Files.move(newCrawlerLog, crawlerLog);
    }

    private static class CrawlDataLocator implements Function<WorkLogEntry, Optional<Map.Entry<WorkLogEntry, Path>>> {

        private final Path crawlRootDir;

        CrawlDataLocator(Path crawlRootDir) {
            this.crawlRootDir = crawlRootDir;
        }

        @Override
        public Optional<Map.Entry<WorkLogEntry, Path>> apply(WorkLogEntry entry) {
            var path = getCrawledFilePath(crawlRootDir, entry.path());

            if (!Files.exists(path)) {
                return Optional.empty();
            }

            try {
                return Optional.of(Map.entry(entry, path));
            }
            catch (Exception ex) {
                return Optional.empty();
            }
        }

        private Path getCrawledFilePath(Path crawlDir, String fileName) {
            int sp = fileName.lastIndexOf('/');

            // Normalize the filename
            if (sp >= 0 && sp + 1 < fileName.length())
                fileName = fileName.substring(sp + 1);
            if (fileName.length() < 4)
                fileName = Strings.repeat("0", 4 - fileName.length()) + fileName;

            String sp1 = fileName.substring(0, 2);
            String sp2 = fileName.substring(2, 4);
            return crawlDir.resolve(sp1).resolve(sp2).resolve(fileName);
        }
        return outputFile;
    }

}
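Editor's note: the comment above pins down why migrateParquetData is synchronized: parquet decoding is memory-heavy, so running conversions in parallel multiplies peak heap use. A minimal sketch of the same serialization pattern, with hypothetical names (MigrationGate and convertOne are illustrative, not from this codebase):

import java.io.IOException;
import java.nio.file.Path;

// Illustrative only: funnel a memory-hungry conversion through one thread at a time.
class MigrationGate {
    // synchronized on the instance, like migrateParquetData above
    synchronized Path convertIfNeeded(Path input) throws IOException {
        if (!input.toString().endsWith(".parquet")) {
            return input; // nothing to migrate
        }
        return convertOne(input); // only one conversion in flight per gate
    }

    private Path convertOne(Path input) throws IOException {
        // stand-in for SlopCrawlDataRecord.convertFromParquet(...)
        return input.resolveSibling(
                input.getFileName().toString().replace(".parquet", ".slop.zip"));
    }
}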
@@ -1,5 +1,8 @@
package nu.marginalia.crawl;

import com.google.inject.Inject;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@@ -9,6 +12,7 @@ import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.time.Instant;
import java.util.Objects;
import java.util.Optional;

/** Supplemental sqlite database for storing the summary of a crawl.
@@ -60,6 +64,25 @@ public class DomainStateDb implements AutoCloseable {

    }

    public record FaviconRecord(String contentType, byte[] imageData) {}

    @Inject
    public DomainStateDb(FileStorageService fileStorageService) throws SQLException {
        this(findFilename(fileStorageService));
    }

    private static Path findFilename(FileStorageService fileStorageService) throws SQLException {
        var fsId = fileStorageService.getOnlyActiveFileStorage(FileStorageType.CRAWL_DATA);

        if (fsId.isPresent()) {
            var fs = fileStorageService.getStorage(fsId.get());
            return fs.asPath().resolve("domainstate.db");
        }
        else {
            throw new SQLException("Could not find crawl data storage");
        }
    }

    public DomainStateDb(Path filename) throws SQLException {
        String sqliteDbString = "jdbc:sqlite:" + filename.toString();
        connection = DriverManager.getConnection(sqliteDbString);

@@ -74,7 +97,13 @@ public class DomainStateDb implements AutoCloseable {
                         feedUrl TEXT
                         )
                         """);

            stmt.executeUpdate("""
                         CREATE TABLE IF NOT EXISTS favicon (
                             domain TEXT PRIMARY KEY,
                             contentType TEXT NOT NULL,
                             icon BLOB NOT NULL
                         )
                         """);
            stmt.execute("PRAGMA journal_mode=WAL");
        }
    }
@@ -85,6 +114,41 @@ public class DomainStateDb implements AutoCloseable {
    }


    public void saveIcon(String domain, FaviconRecord faviconRecord) {
        try (var stmt = connection.prepareStatement("""
                INSERT OR REPLACE INTO favicon (domain, contentType, icon)
                VALUES(?, ?, ?)
                """)) {
            stmt.setString(1, domain);
            stmt.setString(2, Objects.requireNonNullElse(faviconRecord.contentType, "application/octet-stream"));
            stmt.setBytes(3, faviconRecord.imageData);
            stmt.executeUpdate();
        }
        catch (SQLException ex) {
            logger.error("Failed to insert favicon", ex);
        }
    }

    public Optional<FaviconRecord> getIcon(String domain) {
        try (var stmt = connection.prepareStatement("SELECT contentType, icon FROM favicon WHERE DOMAIN = ?")) {
            stmt.setString(1, domain);
            var rs = stmt.executeQuery();

            if (rs.next()) {
                return Optional.of(
                        new FaviconRecord(
                                rs.getString("contentType"),
                                rs.getBytes("icon")
                        )
                );
            }
        } catch (SQLException e) {
            logger.error("Failed to retrieve favicon", e);
        }

        return Optional.empty();
    }
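Editor's note: a hedged usage sketch of the favicon accessors added above — the same shape as the testFavicon case later in this diff; the path and payload here are illustrative:

import java.nio.file.Path;

class FaviconDbDemo {
    public static void main(String[] args) throws Exception {
        byte[] payload = "hello world".getBytes(); // stand-in icon data

        try (var db = new DomainStateDb(Path.of("/tmp/domainstate.db"))) {
            db.saveIcon("www.marginalia.nu",
                    new DomainStateDb.FaviconRecord("text/plain", payload));

            db.getIcon("www.marginalia.nu").ifPresent(icon ->
                    System.out.println(icon.contentType() + ": "
                            + icon.imageData().length + " bytes"));
        }
    }
}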
    public void save(SummaryRecord record) {
        try (var stmt = connection.prepareStatement("""
                INSERT OR REPLACE INTO summary (domain, lastUpdatedEpochMs, state, stateDesc, feedUrl)
@@ -23,12 +23,10 @@ import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.net.http.HttpTimeoutException;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.*;
import java.util.concurrent.Executors;

@@ -47,6 +45,7 @@ public class HttpFetcherImpl implements HttpFetcher {
    private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();

    private final Duration requestTimeout = Duration.ofSeconds(10);
    private final Duration probeTimeout = Duration.ofSeconds(30);

    @Override
    public void setAllowAllContentTypes(boolean allowAllContentTypes) {
@@ -109,23 +108,27 @@ public class HttpFetcherImpl implements HttpFetcher {
                .HEAD()
                .uri(url.asURI())
                .header("User-agent", userAgentString)
                .timeout(requestTimeout)
                .timeout(probeTimeout)
                .build();
        } catch (URISyntaxException e) {
            return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
        }

        try {
            var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
            EdgeUrl rspUri = new EdgeUrl(rsp.uri());
        for (int tries = 0;; tries++) {
            try {
                var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
                EdgeUrl rspUri = new EdgeUrl(rsp.uri());

            if (!Objects.equals(rspUri.domain, url.domain)) {
                return new DomainProbeResult.Redirect(rspUri.domain);
                if (!Objects.equals(rspUri.domain, url.domain)) {
                    return new DomainProbeResult.Redirect(rspUri.domain);
                }
                return new DomainProbeResult.Ok(rspUri);
            } catch (Exception ex) {
                if (tries > 3) {
                    return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
                }
                // else try again ...
            }
            return new DomainProbeResult.Ok(rspUri);
        }
        catch (Exception ex) {
            return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
        }
    }
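Editor's note: the probe now retries transient failures instead of failing on the first exception; the error path only triggers once tries exceeds 3. A generic sketch of that bounded-retry shape, with illustrative names:

import java.util.concurrent.Callable;

// Illustrative only: retry an action a fixed number of times, rethrowing on exhaustion.
final class Retry {
    static <T> T withRetries(int maxRetries, Callable<T> action) throws Exception {
        for (int tries = 0;; tries++) {
            try {
                return action.call();
            } catch (Exception ex) {
                if (tries >= maxRetries) {
                    throw ex; // give up, mirroring the DomainProbeResult.Error branch
                }
                // else loop and try again, as in the probe above
            }
        }
    }
}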
@@ -145,7 +148,7 @@ public class HttpFetcherImpl implements HttpFetcher {
        var headBuilder = HttpRequest.newBuilder()
                .HEAD()
                .uri(url.asURI())
                .header("User-agent", userAgentString)
                .header("User-Agent", userAgentString)
                .header("Accept-Encoding", "gzip")
                .timeout(requestTimeout)
                ;
@@ -217,7 +220,7 @@ public class HttpFetcherImpl implements HttpFetcher {
        var getBuilder = HttpRequest.newBuilder()
                .GET()
                .uri(url.asURI())
                .header("User-agent", userAgentString)
                .header("User-Agent", userAgentString)
                .header("Accept-Encoding", "gzip")
                .header("Accept-Language", "en,*;q=0.5")
                .header("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
@@ -248,6 +251,7 @@ public class HttpFetcherImpl implements HttpFetcher {
        return new SitemapRetriever();
    }

    /** Recursively fetch sitemaps */
    @Override
    public List<EdgeUrl> fetchSitemapUrls(String root, CrawlDelayTimer delayTimer) {
        try {
@@ -267,7 +271,7 @@ public class HttpFetcherImpl implements HttpFetcher {
            while (!sitemapQueue.isEmpty() && ret.size() < 20_000 && ++fetchedSitemaps < 10) {
                var head = sitemapQueue.removeFirst();

                switch (fetchSitemap(head)) {
                switch (fetchSingleSitemap(head)) {
                    case SitemapResult.SitemapUrls(List<String> urls) -> {

                        for (var url : urls) {
@@ -303,13 +307,13 @@ public class HttpFetcherImpl implements HttpFetcher {
    }


    private SitemapResult fetchSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
    private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
        HttpRequest getRequest = HttpRequest.newBuilder()
                .GET()
                .uri(sitemapUrl.asURI())
                .header("Accept-Encoding", "gzip")
                .header("Accept", "text/*, */*;q=0.9")
                .header("User-agent", userAgentString)
                .header("User-Agent", userAgentString)
                .timeout(requestTimeout)
                .build();

@@ -388,7 +392,7 @@ public class HttpFetcherImpl implements HttpFetcher {
                .uri(url.asURI())
                .header("Accept-Encoding", "gzip")
                .header("Accept", "text/*, */*;q=0.9")
                .header("User-agent", userAgentString)
                .header("User-Agent", userAgentString)
                .timeout(requestTimeout);

        HttpFetchResult result = recorder.fetch(client, getRequest.build());
@@ -96,7 +96,7 @@ public class WarcRecorder implements AutoCloseable {
        try {
            response = client.send(request, java.net.http.HttpResponse.BodyHandlers.ofInputStream());
        }
        catch (IOException ex) {
        catch (Exception ex) {
            logger.warn("Failed to fetch URL {}: {}", requestUri, ex.getMessage());
            return new HttpFetchResult.ResultException(ex);
        }
@@ -44,6 +44,14 @@ public class DomainLocks {
        return new Semaphore(2);
    }

    public boolean canLock(EdgeDomain domain) {
        Semaphore sem = locks.get(domain.topDomain.toLowerCase());
        if (null == sem)
            return true;
        else
            return sem.availablePermits() > 0;
    }

    public static class DomainLock implements AutoCloseable {
        private final String domainName;
        private final Semaphore semaphore;
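Editor's note: canLock reads availablePermits() without reserving anything, so it is a racy hint rather than a reservation — which is exactly why canRun() earlier in this diff is documented as "best effort". A small sketch of the probe-then-acquire pattern:

import java.util.concurrent.Semaphore;

// Illustrative only: a permit probe can go stale between the check and the
// acquire, so the blocking acquire remains the authoritative step.
class ProbeThenAcquire {
    public static void main(String[] args) throws InterruptedException {
        Semaphore permits = new Semaphore(2); // same default as the hunk above

        if (permits.availablePermits() > 0) { // cheap scheduling hint
            permits.acquire();                // may still block if another thread won
            try {
                // ... do work under the permit
            } finally {
                permits.release();
            }
        }
    }
}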
@@ -4,6 +4,7 @@ import nu.marginalia.ContentTypes;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.lsh.EasyLSH;
import nu.marginalia.model.crawldata.CrawledDocument;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@@ -11,51 +12,73 @@ import javax.annotation.Nullable;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.Objects;
import java.util.Optional;

/** A reference to a domain that has been crawled before. */
public class CrawlDataReference implements AutoCloseable {
public class CrawlDataReference implements AutoCloseable, Iterable<CrawledDocument> {

    private boolean closed = false;

    @Nullable
    private final Path path;

    @Nullable
    private SerializableCrawlDataStream data = null;

    private final SerializableCrawlDataStream data;
    private static final Logger logger = LoggerFactory.getLogger(CrawlDataReference.class);

    public CrawlDataReference(SerializableCrawlDataStream data) {
        this.data = data;
    public CrawlDataReference(@Nullable Path path) {
        this.path = path;
    }

    public CrawlDataReference() {
        this(SerializableCrawlDataStream.empty());
        this(null);
    }

    /** Delete the associated data from disk, if it exists */
    public void delete() throws IOException {
        Path filePath = data.path();

        if (filePath != null) {
            Files.deleteIfExists(filePath);
        if (path != null) {
            Files.deleteIfExists(path);
        }
    }

    /** Get the next document from the crawl data,
     * returning null when there are no more documents
     * available
     */
    @Nullable
    public CrawledDocument nextDocument() {
        try {
            while (data.hasNext()) {
                if (data.next() instanceof CrawledDocument doc) {
                    if (!ContentTypes.isAccepted(doc.contentType))
                        continue;
    public @NotNull Iterator<CrawledDocument> iterator() {

                    return doc;
        requireStream();
        // Guaranteed by requireStream, but helps java
        Objects.requireNonNull(data);

        return data.map(next -> {
            if (next instanceof CrawledDocument doc && ContentTypes.isAccepted(doc.contentType)) {
                return Optional.of(doc);
            }
            else {
                return Optional.empty();
            }
        });
    }

    /** After calling this method, data is guaranteed to be non-null */
    private void requireStream() {
        if (closed) {
            throw new IllegalStateException("Use after close()");
        }

        if (data == null) {
            try {
                if (path != null) {
                    data = SerializableCrawlDataStream.openDataStream(path);
                    return;
                }
            }
        }
        }
        catch (IOException ex) {
            logger.error("Failed to read next document", ex);
        }
            catch (Exception ex) {
                logger.error("Failed to open stream", ex);
            }

        return null;
            data = SerializableCrawlDataStream.empty();
        }
    }

    public static boolean isContentBodySame(byte[] one, byte[] other) {

@@ -98,7 +121,12 @@ public class CrawlDataReference implements AutoCloseable {
    }

    @Override
    public void close() throws Exception {
        data.close();
    public void close() throws IOException {
        if (!closed) {
            if (data != null) {
                data.close();
            }
            closed = true;
        }
    }
}
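Editor's note: the refactor turns CrawlDataReference into a lazily-opened Iterable — the backing stream is only opened on first iteration, and close() is idempotent. A hedged sketch of that lazy-open pattern in isolation, with stand-in names:

import java.util.Iterator;
import java.util.List;

// Illustrative only: an Iterable that defers opening its (expensive) source
// until iterator() is first called, mirroring requireStream() above.
class LazySource implements Iterable<String>, AutoCloseable {
    private List<String> data = null;   // stands in for the crawl data stream
    private boolean closed = false;

    @Override
    public Iterator<String> iterator() {
        if (closed) throw new IllegalStateException("Use after close()");
        if (data == null) {
            data = List.of("doc1", "doc2"); // stand-in for opening the file
        }
        return data.iterator();
    }

    @Override
    public void close() {
        closed = true; // idempotent; a real impl would release the stream here
    }
}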
@@ -19,7 +19,6 @@ import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.DocumentBodyExtractor;
import nu.marginalia.model.body.HttpFetchResult;
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
import org.jsoup.Jsoup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@@ -90,30 +89,45 @@ public class CrawlerRetreiver implements AutoCloseable {
    }

    public int crawlDomain(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
        try {
        try (oldCrawlData) {
            // Do an initial domain probe to determine the root URL
            EdgeUrl rootUrl;

            var probeResult = probeRootUrl();
            switch (probeResult) {

            return switch (probeResult) {
                case HttpFetcher.DomainProbeResult.Ok(EdgeUrl probedUrl) -> {
                    rootUrl = probedUrl; // Good track

                    // Sleep after the initial probe, we don't have access to the robots.txt yet
                    // so we don't know the crawl delay
                    TimeUnit.SECONDS.sleep(1);

                    final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(probedUrl.domain, warcRecorder);
                    final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());

                    delayTimer.waitFetchDelay(0); // initial delay after robots.txt

                    DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, delayTimer);
                    domainStateDb.save(summaryRecord);

                    // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
                    if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
                        // If we have reference data, we will always grow the crawl depth a bit
                        crawlFrontier.increaseDepth(1.5, 2500);
                    }

                    oldCrawlData.close(); // proactively close the crawl data reference here to not hold onto expensive resources

                    yield crawlDomain(probedUrl, robotsRules, delayTimer, domainLinks);
                }
                case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> {
                    domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString()));
                    return 1;
                    yield 1;
                }
                case HttpFetcher.DomainProbeResult.Error(CrawlerDomainStatus status, String desc) -> {
                    domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, status.toString(), desc));
                    return 1;
                    yield 1;
                }
            }
            };

            // Sleep after the initial probe, we don't have access to the robots.txt yet
            // so we don't know the crawl delay
            TimeUnit.SECONDS.sleep(1);

            return crawlDomain(oldCrawlData, rootUrl, domainLinks);
        }
        catch (Exception ex) {
            logger.error("Error crawling domain {}", domain, ex);
@@ -121,28 +135,15 @@ public class CrawlerRetreiver implements AutoCloseable {
        }
    }

    private int crawlDomain(CrawlDataReference oldCrawlData,
                            EdgeUrl rootUrl,
                            DomainLinks domainLinks) throws InterruptedException {
    private int crawlDomain(EdgeUrl rootUrl,
                            SimpleRobotRules robotsRules,
                            CrawlDelayTimer delayTimer,
                            DomainLinks domainLinks) {

        final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder);
        final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());

        delayTimer.waitFetchDelay(0); // initial delay after robots.txt

        DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(rootUrl, delayTimer);
        domainStateDb.save(summaryRecord);

        // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
        if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
            // If we have reference data, we will always grow the crawl depth a bit
            crawlFrontier.increaseDepth(1.5, 2500);
        }

        // Add external links to the crawl frontier
        crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));

        // Fetch sitemaps
        for (var sitemap : robotsRules.getSitemaps()) {
            crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
@@ -273,7 +274,16 @@ public class CrawlerRetreiver implements AutoCloseable {
        feedLink.ifPresent(s -> fetcher.fetchSitemapUrls(s, timer));

        // Grab the favicon if it exists
        fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());

        if (fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty()) instanceof HttpFetchResult.ResultOk iconResult) {
            String contentType = iconResult.header("Content-Type");
            byte[] iconData = iconResult.getBodyBytes();

            domainStateDb.saveIcon(
                    domain,
                    new DomainStateDb.FaviconRecord(contentType, iconData)
            );
        }
        timer.waitFetchDelay(0);

    }
@@ -371,8 +381,10 @@ public class CrawlerRetreiver implements AutoCloseable {
            if (docOpt.isPresent()) {
                var doc = docOpt.get();

                crawlFrontier.enqueueLinksFromDocument(top, doc);
                crawlFrontier.addVisited(new EdgeUrl(ok.uri()));
                var responseUrl = new EdgeUrl(ok.uri());

                crawlFrontier.enqueueLinksFromDocument(responseUrl, doc);
                crawlFrontier.addVisited(responseUrl);
            }
        }
        else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {
@@ -40,18 +40,12 @@ public class CrawlerRevisitor {
        int errors = 0;
        int skipped = 0;

        for (;;) {
        for (CrawledDocument doc : oldCrawlData) {
            if (errors > 20) {
                // If we've had too many errors, we'll stop trying to recrawl
                break;
            }

            CrawledDocument doc = oldCrawlData.nextDocument();

            if (doc == null)
                break;

            // This Shouldn't Happen (TM)
            var urlMaybe = EdgeUrl.parse(doc.url);
            if (urlMaybe.isEmpty())
                continue;
@@ -1,55 +0,0 @@
package nu.marginalia.io;

import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream;
import nu.marginalia.io.crawldata.format.SlopSerializableCrawlDataStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

public class CrawledDomainReader {
    private static final Logger logger = LoggerFactory.getLogger(CrawledDomainReader.class);

    /** An iterator-like access to domain data. This must be closed, otherwise it will leak off-heap memory! */
    public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException
    {
        String fileName = fullPath.getFileName().toString();
        if (fileName.endsWith(".parquet")) {
            try {
                return new ParquetSerializableCrawlDataStream(fullPath);
            } catch (Exception ex) {
                logger.error("Error reading domain data from " + fullPath, ex);
                return SerializableCrawlDataStream.empty();
            }
        }
        else if (fileName.endsWith(".slop.zip")) {
            try {
                return new SlopSerializableCrawlDataStream(fullPath);
            } catch (Exception ex) {
                logger.error("Error reading domain data from " + fullPath, ex);
                return SerializableCrawlDataStream.empty();
            }
        }
        else {
            logger.error("Unknown file type: {}", fullPath);
            return SerializableCrawlDataStream.empty();
        }
    }

    /** An iterator-like access to domain data. This must be closed, otherwise it will leak off-heap memory! */
    public static SerializableCrawlDataStream createDataStream(Path basePath, String domain, String id) throws IOException {
        Path parquetPath = CrawlerOutputFile.getParquetPath(basePath, id, domain);

        if (Files.exists(parquetPath)) {
            return createDataStream(parquetPath);
        }
        else {
            throw new FileNotFoundException("No such file: " + parquetPath);
        }
    }

}
@@ -35,19 +35,6 @@ public class CrawlerOutputFile {
        return destDir.resolve(id + "-" + filesystemSafeName(domain) + "-" + version.suffix + ".warc.gz");
    }

    public static Path createParquetPath(Path basePath, String id, String domain) throws IOException {
        id = padId(id);

        String first = id.substring(0, 2);
        String second = id.substring(2, 4);

        Path destDir = basePath.resolve(first).resolve(second);
        if (!Files.exists(destDir)) {
            Files.createDirectories(destDir);
        }
        return destDir.resolve(id + "-" + filesystemSafeName(domain) + ".parquet");
    }

    public static Path createSlopPath(Path basePath, String id, String domain) throws IOException {
        id = padId(id);

@@ -71,16 +58,17 @@ public class CrawlerOutputFile {
        return destDir.resolve(id + "-" + filesystemSafeName(domain) + ".parquet");
    }

    public static Path getWarcPath(Path basePath, String id, String domain, WarcFileVersion version) {
    public static Path getSlopPath(Path basePath, String id, String domain) {
        id = padId(id);

        String first = id.substring(0, 2);
        String second = id.substring(2, 4);

        Path destDir = basePath.resolve(first).resolve(second);
        return destDir.resolve(id + "-" + filesystemSafeName(domain) + ".warc" + version.suffix);
        return destDir.resolve(id + "-" + filesystemSafeName(domain) + ".slop.zip");
    }


    /**
     * Pads the given ID with leading zeros to ensure it has a length of 4 characters.
     */
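Editor's note: the two-level directory sharding above places each output file under the first two and next two characters of the zero-padded id (padding behavior per padId's javadoc). A small illustrative computation, with made-up inputs:

import java.nio.file.Path;

class ShardPathDemo {
    public static void main(String[] args) {
        String id = "3f";
        // Per the javadoc above: left-pad with zeros to a length of 4
        String padded = "0".repeat(4 - id.length()) + id;

        Path dest = Path.of("/crawl-data")
                .resolve(padded.substring(0, 2))   // "00"
                .resolve(padded.substring(2, 4))   // "3f"
                .resolve(padded + "-example.com.slop.zip");

        System.out.println(dest); // /crawl-data/00/3f/003f-example.com.slop.zip
    }
}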
@@ -1,5 +1,7 @@
package nu.marginalia.io;

import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream;
import nu.marginalia.io.crawldata.format.SlopSerializableCrawlDataStream;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.crawldata.CrawledDomain;
import nu.marginalia.model.crawldata.SerializableCrawlData;

@@ -18,7 +20,6 @@ import java.util.function.Function;
/** Closeable, exception-throwing iterator over serialized crawl data.
 * The data may appear in any order, and the iterator must be closed.
 *
 * @see CrawledDomainReader
 * */
public interface SerializableCrawlDataStream extends AutoCloseable {
    Logger logger = LoggerFactory.getLogger(SerializableCrawlDataStream.class);

@@ -27,13 +28,60 @@ public interface SerializableCrawlDataStream extends AutoCloseable {

    /** Return a size hint for the stream. 0 is returned if the hint is not available,
     * or if the file is deemed too small to bother */
    default int sizeHint() { return 0; }
    default int getSizeHint() { return 0; }

    boolean hasNext() throws IOException;

    @Nullable
    default Path path() { return null; }

    void close() throws IOException;

    /** An iterator-like access to domain data. This must be closed, otherwise it will leak off-heap memory! */
    static SerializableCrawlDataStream openDataStream(Path fullPath) throws IOException
    {
        String fileName = fullPath.getFileName().toString();

        if (fileName.endsWith(".slop.zip")) {
            try {
                return new SlopSerializableCrawlDataStream(fullPath);
            } catch (Exception ex) {
                logger.error("Error reading domain data from " + fullPath, ex);
                return SerializableCrawlDataStream.empty();
            }
        }

        else if (fileName.endsWith(".parquet")) {
            logger.error("Opening deprecated parquet-style crawl data stream", new Exception());
            try {
                return new ParquetSerializableCrawlDataStream(fullPath);
            } catch (Exception ex) {
                logger.error("Error reading domain data from " + fullPath, ex);
                return SerializableCrawlDataStream.empty();
            }
        }

        logger.error("Unknown file type: {}", fullPath);
        return SerializableCrawlDataStream.empty();
    }

    /** Get an indication of the size of the stream. This is used to determine whether to
     * load the stream into memory or not. 0 is returned if the hint is not available,
     * or if the file is deemed too small to bother */
    static int getSizeHint(Path fullPath) {
        String fileName = fullPath.getFileName().toString();
        if (fileName.endsWith(".parquet")) {
            return ParquetSerializableCrawlDataStream.sizeHint(fullPath);
        }
        else if (fileName.endsWith(".slop.zip")) {
            return SlopSerializableCrawlDataStream.sizeHint(fullPath);
        }
        else {
            return 0;
        }
    }

    default <T> Iterator<T> map(Function<SerializableCrawlData, Optional<T>> mapper) {
        return new Iterator<>() {
            T next = null;
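Editor's note: call sites migrate from CrawledDomainReader.createDataStream to this static factory; the typical read loop, taken from the test updates later in this diff (the path here is illustrative), looks like:

// Fragment mirroring the updated call sites; assumes the usual imports
// (java.nio.file.Path, nu.marginalia.model.crawldata.CrawledDocument).
Path crawlData = Path.of("00/3f/003f-example.com.slop.zip"); // illustrative
try (var stream = SerializableCrawlDataStream.openDataStream(crawlData)) {
    while (stream.hasNext()) {
        if (stream.next() instanceof CrawledDocument doc) {
            // ... process doc
        }
    }
}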
@@ -40,7 +40,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
        return path;
    }

    public int sizeHint() {
    public static int sizeHint(Path path) {
        // Only calculate size hint for large files
        // (the reason we calculate them in the first place is to assess whether it is large
        // because it has many documents, or because it is a small number of large documents)
@@ -52,7 +52,7 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
        return path;
    }

    public int sizeHint() {
    public static int sizeHint(Path path) {
        // Only calculate size hint for large files
        // (the reason we calculate them in the first place is to assess whether it is large
        // because it has many documents, or because it is a small number of large documents)
@@ -12,7 +12,7 @@ import java.io.InputStream;
import java.net.InetAddress;
import java.net.URI;
import java.net.http.HttpHeaders;
import java.util.Optional;
import java.util.*;

/* FIXME: This interface has a very unfortunate name that is not very descriptive.
 */
@@ -58,13 +58,27 @@ public sealed interface HttpFetchResult {
            int statusCode,
            HttpHeaders headers,
            String ipAddress,
            byte[] bytesRaw,
            byte[] bytesRaw, // raw data for the entire response including headers
            int bytesStart,
            int bytesLength
    ) implements HttpFetchResult {

        public ResultOk(URI uri, int status, MessageHeaders headers, String ipAddress, byte[] bytes, int bytesStart, int length) {
            this(uri, status, HttpHeaders.of(headers.map(), (k,v) -> true), ipAddress, bytes, bytesStart, length);
            this(uri, status, convertHeaders(headers), ipAddress, bytes, bytesStart, length);
        }

        private static HttpHeaders convertHeaders(MessageHeaders messageHeaders) {
            Map<String, List<String>> inputMap = messageHeaders.map();
            Map<String, List<String>> filteredMap = new HashMap<>(Math.max(4, inputMap.size()));

            inputMap.forEach((k, v) -> {
                if (k.isBlank()) return;
                if (!Character.isAlphabetic(k.charAt(0))) return;

                filteredMap.put(k, v);
            });

            return HttpHeaders.of(filteredMap, (k,v) -> true);
        }

        public boolean isOk() {

@@ -75,6 +89,12 @@ public sealed interface HttpFetchResult {
            return new ByteArrayInputStream(bytesRaw, bytesStart, bytesLength);
        }

        /** Copy the byte range corresponding to the payload of the response.
         *  Warning: copies the data; use getInputStream() for zero-copy access */
        public byte[] getBodyBytes() {
            return Arrays.copyOfRange(bytesRaw, bytesStart, bytesStart + bytesLength);
        }

        public Optional<Document> parseDocument() {
            return DocumentBodyExtractor.asString(this).flatMapOpt((contentType, body) -> {
                if (contentType.is("text/html")) {
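Editor's note: convertHeaders drops names that are blank or don't begin with a letter before rebuilding the HttpHeaders. A plausible reading — an inference from the predicate, not stated in the commit — is that this strips HTTP/2 pseudo-headers such as ":status" arriving via MessageHeaders. A quick illustration of the same predicate:

import java.net.http.HttpHeaders;
import java.util.List;
import java.util.Map;

class HeaderFilterDemo {
    public static void main(String[] args) {
        Map<String, List<String>> raw = Map.of(
                ":status", List.of("200"),            // pseudo-header, starts with ':'
                "Content-Type", List.of("text/html"));

        // Same filtering idea as convertHeaders above
        HttpHeaders filtered = HttpHeaders.of(raw,
                (k, v) -> !k.isBlank() && Character.isAlphabetic(k.charAt(0)));

        System.out.println(filtered.map().keySet()); // [Content-Type]
    }
}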
@@ -59,9 +59,12 @@ public final class CrawledDocument implements SerializableCrawlData {
    }

    public Document parseBody() throws IOException {
        // Prevent stalls from parsing excessively large documents

        return DocumentBodyToString.getParsedData(
                ContentType.parse(contentType),
                documentBodyBytes,
                200_000,
                url);
    }
@@ -108,15 +108,17 @@ public record SlopCrawlDataRecord(String domain,
    public static void convertFromParquet(Path parquetInput, Path slopOutput) throws IOException {
        Path tempDir = Files.createTempDirectory(slopOutput.getParent(), "conversion");

        try (var writer = new Writer(tempDir)) {
            CrawledDocumentParquetRecordFileReader.stream(parquetInput).forEach(
                    parquetRecord -> {
                        try {
                            writer.write(new SlopCrawlDataRecord(parquetRecord));
                        } catch (IOException e) {
                            throw new RuntimeException(e);
                        }
                    });
        try (var writer = new Writer(tempDir);
             var stream = CrawledDocumentParquetRecordFileReader.stream(parquetInput))
        {
            stream.forEach(
                    parquetRecord -> {
                        try {
                            writer.write(new SlopCrawlDataRecord(parquetRecord));
                        } catch (IOException e) {
                            throw new RuntimeException(e);
                        }
                    });
        }
        catch (IOException ex) {
            FileUtils.deleteDirectory(tempDir.toFile());
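Editor's note: the widened try-with-resources also closes the parquet record Stream, not just the Writer; a Stream backed by file I/O holds an open handle until it is closed. The same rule, shown with a standard-library example:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.stream.Stream;

class StreamCloseDemo {
    static void countLines(Path file) throws IOException {
        // Files.lines holds an open file handle; without try-with-resources
        // the handle is released only at GC time, if ever.
        try (Stream<String> lines = Files.lines(file)) {
            System.out.println(lines.count());
        }
    }
}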
@@ -10,7 +10,7 @@ import java.nio.file.Path;
import java.sql.SQLException;
import java.time.Instant;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.*;

class DomainStateDbTest {

@@ -26,7 +26,7 @@ class DomainStateDbTest {
    }

    @Test
    public void testSunnyDay() throws SQLException {
    public void testSummaryRecord() throws SQLException {
        try (var db = new DomainStateDb(tempFile)) {
            var allFields = new DomainStateDb.SummaryRecord(
                    "all.marginalia.nu",
@@ -63,4 +63,21 @@ class DomainStateDbTest {
        }
    }

    @Test
    public void testFavicon() throws SQLException {
        try (var db = new DomainStateDb(tempFile)) {
            db.saveIcon("www.marginalia.nu", new DomainStateDb.FaviconRecord("text/plain", "hello world".getBytes()));

            var maybeData = db.getIcon("www.marginalia.nu");
            assertTrue(maybeData.isPresent());
            var actualData = maybeData.get();

            assertEquals("text/plain", actualData.contentType());
            assertArrayEquals("hello world".getBytes(), actualData.imageData());

            maybeData = db.getIcon("foobar");
            assertTrue(maybeData.isEmpty());
        }
    }

}
@@ -10,7 +10,6 @@ import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.*;
import nu.marginalia.io.CrawledDomainReader;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
@@ -227,7 +226,7 @@ class CrawlerRetreiverTest {

        convertToParquet(tempFileWarc1, tempFileParquet1);

        try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
        try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
            while (stream.hasNext()) {
                if (stream.next() instanceof CrawledDocument doc) {
                    data.add(doc);
@@ -280,7 +279,7 @@ class CrawlerRetreiverTest {

        convertToParquet(tempFileWarc1, tempFileParquet1);

        try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
        try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
            while (stream.hasNext()) {
                if (stream.next() instanceof CrawledDocument doc) {
                    data.add(doc);
@@ -329,7 +328,7 @@ class CrawlerRetreiverTest {
        doCrawl(tempFileWarc1, specs);
        convertToParquet(tempFileWarc1, tempFileParquet1);

        try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
        try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
            while (stream.hasNext()) {
                if (stream.next() instanceof CrawledDocument doc) {
                    data.add(doc);
@@ -376,7 +375,7 @@ class CrawlerRetreiverTest {
        doCrawl(tempFileWarc1, specs);
        convertToParquet(tempFileWarc1, tempFileParquet1);
        doCrawlWithReferenceStream(specs,
                CrawledDomainReader.createDataStream(tempFileParquet1)
                new CrawlDataReference(tempFileParquet1)
        );
        convertToParquet(tempFileWarc2, tempFileParquet2);

@@ -397,7 +396,7 @@ class CrawlerRetreiverTest {
            });
        }

        try (var ds = CrawledDomainReader.createDataStream(tempFileParquet2)) {
        try (var ds = SerializableCrawlDataStream.openDataStream(tempFileParquet2)) {
            while (ds.hasNext()) {
                var doc = ds.next();
                if (doc instanceof CrawledDomain dr) {
@@ -439,7 +438,7 @@ class CrawlerRetreiverTest {

        convertToParquet(tempFileWarc1, tempFileParquet1);

        try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) {
        try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
            while (stream.hasNext()) {
                var doc = stream.next();
                data.computeIfAbsent(doc.getClass(), c -> new ArrayList<>()).add(doc);
@@ -448,11 +447,9 @@ class CrawlerRetreiverTest {
            throw new RuntimeException(e);
        }

        var stream = CrawledDomainReader.createDataStream(tempFileParquet1);

        System.out.println("---");

        doCrawlWithReferenceStream(specs, stream);
        doCrawlWithReferenceStream(specs, new CrawlDataReference(tempFileParquet1));

        var revisitCrawlFrontier = new DomainCrawlFrontier(
                new EdgeDomain("www.marginalia.nu"),
@@ -488,7 +485,7 @@ class CrawlerRetreiverTest {
            });
        }

        try (var ds = CrawledDomainReader.createDataStream(tempFileParquet2)) {
        try (var ds = SerializableCrawlDataStream.openDataStream(tempFileParquet2)) {
            while (ds.hasNext()) {
                var doc = ds.next();
                if (doc instanceof CrawledDomain dr) {
@@ -509,12 +506,11 @@ class CrawlerRetreiverTest {
        }
    }

    private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
    private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, CrawlDataReference reference) {
        try (var recorder = new WarcRecorder(tempFileWarc2, new Cookies());
             var db = new DomainStateDb(tempFileDb)
        ) {
            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(),
                    new CrawlDataReference(stream));
            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(), reference);
        }
        catch (IOException | SQLException ex) {
            Assertions.fail(ex);
@@ -3,7 +3,6 @@ package nu.marginalia.extractor;
import com.google.inject.Inject;
import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.io.CrawledDomainReader;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
@@ -59,7 +58,7 @@ public class AtagExporter implements ExporterIf {
            }

            Path crawlDataPath = inputDir.resolve(item.relPath());
            try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
            try (var stream = SerializableCrawlDataStream.openDataStream(crawlDataPath)) {
                exportLinks(tagWriter, stream);
            }
            catch (Exception ex) {
@@ -1,7 +1,6 @@
package nu.marginalia.extractor;

import com.google.inject.Inject;
import nu.marginalia.io.CrawledDomainReader;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.link_parser.FeedExtractor;
import nu.marginalia.link_parser.LinkParser;
@@ -56,7 +55,7 @@ public class FeedExporter implements ExporterIf {
            }

            Path crawlDataPath = inputDir.resolve(item.relPath());
            try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
            try (var stream = SerializableCrawlDataStream.openDataStream(crawlDataPath)) {
                exportFeeds(tagWriter, stream);
            }
            catch (Exception ex) {
@@ -75,7 +74,7 @@ public class FeedExporter implements ExporterIf {
    private boolean exportFeeds(FeedCsvWriter exporter, SerializableCrawlDataStream stream) throws IOException, URISyntaxException {
        FeedExtractor feedExtractor = new FeedExtractor(new LinkParser());

        int size = stream.sizeHint();
        int size = stream.getSizeHint();

        while (stream.hasNext()) {
            if (!(stream.next() instanceof CrawledDocument doc))
@@ -5,7 +5,7 @@ import gnu.trove.map.hash.TLongIntHashMap;
import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.io.CrawledDomainReader;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.language.filter.LanguageFilter;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.sentence.SentenceExtractor;
@@ -103,7 +103,7 @@ public class TermFrequencyExporter implements ExporterIf {
    {
        TLongHashSet words = new TLongHashSet(1000);

        try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
        try (var stream = SerializableCrawlDataStream.openDataStream(crawlDataPath)) {
            while (stream.hasNext()) {
                if (Thread.interrupted())
                    return;
@@ -228,7 +228,7 @@ public class LiveCrawlDataSet implements AutoCloseable {
        }

        @Override
        public boolean hasNext() throws IOException {
        public boolean hasNext() {
            if (dataStack == null) {
                query();
            }
@@ -236,7 +236,7 @@ public class LiveCrawlDataSet implements AutoCloseable {
        }

        @Override
        public void close() throws Exception {
        public void close() {
            dataStack.clear();
        }
    }
@@ -3,7 +3,7 @@ plugins {

    id 'application'
    id 'jvm-test-suite'
    id 'com.google.cloud.tools.jib' version '3.4.3'
    id 'com.google.cloud.tools.jib' version '3.4.4'
}

java {

@@ -3,7 +3,7 @@ plugins {

    id 'application'
    id 'jvm-test-suite'
    id 'com.google.cloud.tools.jib' version '3.4.3'
    id 'com.google.cloud.tools.jib' version '3.4.4'
}

application {

@@ -3,7 +3,7 @@ plugins {

    id 'application'
    id 'jvm-test-suite'
    id 'com.google.cloud.tools.jib' version '3.4.3'
    id 'com.google.cloud.tools.jib' version '3.4.4'
}

application {

@@ -5,7 +5,7 @@ plugins {
    id 'application'
    id 'jvm-test-suite'

    id 'com.google.cloud.tools.jib' version '3.4.3'
    id 'com.google.cloud.tools.jib' version '3.4.4'
}

application {
@@ -7,8 +7,7 @@ import java.util.Arrays;

public enum SearchJsParameter {
    DEFAULT("default"),
    DENY_JS("no-js", "js:true"),
    REQUIRE_JS("yes-js", "js:false");
    DENY_JS("no-js", "special:scripts");

    public final String value;
    public final String[] implictExcludeSearchTerms;
@@ -20,7 +19,6 @@ public enum SearchJsParameter {

    public static SearchJsParameter parse(@Nullable String value) {
        if (DENY_JS.value.equals(value)) return DENY_JS;
        if (REQUIRE_JS.value.equals(value)) return REQUIRE_JS;

        return DEFAULT;
    }
@@ -3,7 +3,7 @@ plugins {
    id 'application'
    id 'jvm-test-suite'
    id 'gg.jte.gradle' version '3.1.15'
    id 'com.google.cloud.tools.jib' version '3.4.3'
    id 'com.google.cloud.tools.jib' version '3.4.4'
}

application {
@@ -41,6 +41,7 @@ dependencies {

    implementation project(':code:functions:live-capture:api')
    implementation project(':code:functions:math:api')
    implementation project(':code:functions:favicon:api')
    implementation project(':code:functions:domain-info:api')
    implementation project(':code:functions:search-query:api')

@@ -104,6 +105,8 @@ task compileTailwind {

    doLast {
        exec {
            // If you're getting a build error like 'npm error could not determine executable to run'
            // pointing you here, you need to run `npm install -D tailwindcss`
            workingDir projectDir
            if (System.getProperty('os.name').toLowerCase().contains('windows')) {
                commandLine 'cmd', '/c', 'npx', 'tailwindcss',
@@ -3,8 +3,14 @@ package nu.marginalia.search;

import com.google.inject.Inject;
import io.jooby.Context;
import io.jooby.Jooby;
import io.jooby.MediaType;
import io.jooby.StatusCode;
import io.prometheus.client.Counter;
import io.prometheus.client.Histogram;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.api.favicon.FaviconClient;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.search.svc.*;
import nu.marginalia.service.discovery.property.ServicePartition;
import nu.marginalia.service.server.BaseServiceParams;
@@ -13,10 +19,14 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.List;
import java.util.NoSuchElementException;

public class SearchService extends JoobyService {

    private final WebsiteUrl websiteUrl;
    private final SearchSiteSubscriptionService siteSubscriptionService;
    private final FaviconClient faviconClient;
    private final DbDomainQueries domainQueries;

    private static final Logger logger = LoggerFactory.getLogger(SearchService.class);
    private static final Histogram wmsa_search_service_request_time = Histogram.build()
@@ -33,12 +43,15 @@ public class SearchService extends JoobyService {

    @Inject
    public SearchService(BaseServiceParams params,
                         WebsiteUrl websiteUrl,
                         SearchFrontPageService frontPageService,
                         SearchAddToCrawlQueueService addToCrawlQueueService,
                         SearchSiteSubscriptionService siteSubscriptionService,
                         SearchSiteInfoService siteInfoService,
                         SearchCrosstalkService crosstalkService,
                         SearchBrowseService searchBrowseService,
                         FaviconClient faviconClient,
                         DbDomainQueries domainQueries,
                         SearchQueryService searchQueryService)
            throws Exception {
        super(params,
@@ -51,8 +64,11 @@ public class SearchService extends JoobyService {
                new SearchAddToCrawlQueueService_(addToCrawlQueueService),
                new SearchBrowseService_(searchBrowseService)
        ));
        this.websiteUrl = websiteUrl;

        this.siteSubscriptionService = siteSubscriptionService;
        this.faviconClient = faviconClient;
        this.domainQueries = domainQueries;
    }

    @Override
@@ -62,6 +78,35 @@ public class SearchService extends JoobyService {
        final String startTimeAttribute = "start-time";

        jooby.get("/export-opml", siteSubscriptionService::exportOpml);

        jooby.get("/site/https://*", this::handleSiteUrlRedirect);
        jooby.get("/site/http://*", this::handleSiteUrlRedirect);

        jooby.get("/site/{domain}/favicon", ctx -> {
            String domain = ctx.path("domain").value();
            logger.info("Finding icon for domain {}", domain);
            domainQueries.getDomainId(new EdgeDomain(domain));
            try {
                DbDomainQueries.DomainIdWithNode domainIdWithNode = domainQueries.getDomainIdWithNode(new EdgeDomain(domain));
                var faviconMaybe = faviconClient.getFavicon(domain, domainIdWithNode.nodeAffinity());

                if (faviconMaybe.isEmpty()) {
                    ctx.setResponseCode(404);
                    return "";
                } else {
                    var favicon = faviconMaybe.get();

                    ctx.responseStream(MediaType.valueOf(favicon.contentType()), consumer -> {
                        consumer.write(favicon.bytes());
                    });
                }
            }
            catch (NoSuchElementException ex) {
                ctx.setResponseCode(404);
            }
            return "";
        });

        jooby.before((Context ctx) -> {
            ctx.setAttribute(startTimeAttribute, System.nanoTime());
        });
@@ -80,5 +125,19 @@ public class SearchService extends JoobyService {
        });
    }

    /** Redirect handler for the case when the user passes
     * a URL like /site/https://example.com/; in this
     * scenario we want to extract the domain name and redirect
     * to /site/example.com/
     */
    private Context handleSiteUrlRedirect(Context ctx) {
        var pv = ctx.path("*").value();
        int trailSlash = pv.indexOf('/');
        if (trailSlash > 0) {
            pv = pv.substring(0, trailSlash);
        }
        ctx.sendRedirect(StatusCode.TEMPORARY_REDIRECT, websiteUrl.withPath("site/" + pv));
        return ctx;
    }

}
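Editor's note: to make the redirect handler concrete, here is the transformation traced by hand (behavior read off the code above; the URL is illustrative):

// /site/https://example.com/some/page  ->  wildcard capture "example.com/some/page"
String pv = "example.com/some/page";
int trailSlash = pv.indexOf('/');     // 11
if (trailSlash > 0) {
    pv = pv.substring(0, trailSlash); // "example.com"
}
// result: 307 redirect to websiteUrl.withPath("site/example.com")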
@@ -7,9 +7,7 @@ import java.util.Arrays;

public enum SearchJsParameter {
    DEFAULT("default"),
    DENY_JS("no-js", "js:true"),
    REQUIRE_JS("yes-js", "js:false");

    DENY_JS("no-js", "special:scripts");
    public final String value;
    public final String[] implictExcludeSearchTerms;

@@ -20,7 +18,6 @@ public enum SearchJsParameter {

    public static SearchJsParameter parse(@Nullable String value) {
        if (DENY_JS.value.equals(value)) return DENY_JS;
        if (REQUIRE_JS.value.equals(value)) return REQUIRE_JS;

        return DEFAULT;
    }
@@ -86,8 +86,10 @@ public record SearchParameters(WebsiteUrl url,
    public String renderUrl() {

        StringBuilder pathBuilder = new StringBuilder("/search?");
        pathBuilder.append("query=").append(URLEncoder.encode(query, StandardCharsets.UTF_8));

        if (query != null) {
            pathBuilder.append("query=").append(URLEncoder.encode(query, StandardCharsets.UTF_8));
        }
        if (profile != SearchProfile.NO_FILTER) {
            pathBuilder.append("&profile=").append(URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8));
        }
@@ -67,6 +67,10 @@ public class DecoratedSearchResults {
        return focusDomainId >= 0;
    }

    public boolean isEmpty() {
        return results.isEmpty();
    }

    public SearchFilters getFilters() {
        return filters;
    }
@@ -81,6 +81,7 @@ public class SearchFilters {
                        ),
                        List.of(
                                new Filter("Vintage", "fa-clock-rotate-left", SearchProfile.VINTAGE, parameters),
                                new Filter("Small Web", "fa-minus", SearchProfile.SMALLWEB, parameters),
                                new Filter("Plain Text", "fa-file", SearchProfile.PLAIN_TEXT, parameters),
                                new Filter("Tilde", "fa-house", SearchProfile.TILDE, parameters)
                        ),
@@ -56,7 +56,9 @@ public class SearchQueryService {
        }
        catch (Exception ex) {
            logger.error("Error", ex);
            return errorPageService.serveError(SearchParameters.defaultsForQuery(websiteUrl, query, page));
            return errorPageService.serveError(
                    SearchParameters.defaultsForQuery(websiteUrl, query, Objects.requireNonNullElse(page, 1))
            );
        }
    }

@@ -140,7 +140,8 @@ public class SearchSiteInfoService {
    ) throws SQLException, ExecutionException {

        if (null == domainName || domainName.isBlank()) {
            return null;
            // If we don't get a domain name, we redirect to the /site endpoint
            return new MapModelAndView("redirect.jte", Map.of("url", "/site"));
        }

        page = Objects.requireNonNullElse(page, 1);
@@ -9,7 +9,7 @@
    <span>
        Access logs containing IP-addresses are retained for up to 24 hours,
        anonymized logs with source addresses removed are sometimes kept longer
        for to help diagnosing bugs.
        to help diagnose bugs.
    </span>
</div>
<div class="flex space-y-4 flex-col">
@@ -33,4 +33,4 @@
    </span>
</div>

</footer>
</footer>
@@ -9,6 +9,14 @@
        nicotine: '#f8f8ee',
        margeblue: '#3e5f6f',
        liteblue: '#0066cc',
      },
      screens: {
        'coarsepointer': {
            'raw': '(pointer: coarse)'
        },
        'finepointer': {
            'raw': '(pointer: fine)'
        },
      }
    },
    screens: {
@@ -23,7 +23,7 @@
            @template.serp.part.searchform(query = results.getParams().query(), profile = results.getProfile(), filters = results.getFilters())
        </div>
        <div class="grow"></div>
        <button class="fixed bottom-10 right-5 sm:hidden text-sm bg-margeblue text-white p-4 rounded-xl active:text-slate-200" id="filter-button">
        <button class="fixed bottom-10 right-5 finepointer:hidden md:hidden text-sm bg-margeblue text-white p-4 rounded-xl active:text-slate-200" id="filter-button">
            <i class="fas fa-filter mr-3"></i>
            Filters
        </button>
@@ -44,6 +44,11 @@
            <div class="grow"></div>
            <a href="${results.getParams().renderUrlWithoutSiteFocus()}" class="fa fa-remove"></a>
        </div>
    @elseif (results.isEmpty())
        <div class="border dark:border-gray-600 rounded flex space-x-4 bg-white dark:bg-gray-800 text-gray-600 dark:text-gray-100 text-sm p-4 items-center">
            No search results found. Try different search terms, or spelling variations. The search engine currently
            only supports queries in the English language.
        </div>
    @endif

    <div class="space-y-4 sm:space-y-6">
@@ -86,7 +86,7 @@
    @endif

    @if(result.getFirst().isTracking())
        <span class="px-1 bg-yellow-100 text-yellow-700 dark:border dark:border-yellow-600 dark:text-yellow-400 dark:bg-black rounded" title="Uses tracking scripts">Track</span>
        <span class="px-1 bg-yellow-100 text-yellow-700 dark:border dark:border-yellow-600 dark:text-yellow-400 dark:bg-black rounded" title="Uses tracking scripts">Tracking</span>
    @endif

    @if(result.getFirst().isScripts())
@@ -94,11 +94,11 @@
    @endif

    @if(result.getFirst().isAds())
        <span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains adtech">Ads</span>
        <span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains adtech">Has Ads</span>
    @endif

    @if(result.getFirst().isAffiliate())
        <span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains Affiliate Link">Affiliate</span>
        <span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains Affiliate Link">Has Affiliate</span>
    @endif

    </span>
@@ -3,7 +3,7 @@

@param SearchFilters filters

<aside class="md:w-64 py-4 shrink-0 hidden sm:block">
<aside class="md:w-64 py-4 shrink-0 hidden md:block finepointer:block">
    <div class="space-y-6 sticky top-4">
        <div class="bg-white dark:bg-gray-800 p-4 border dark:border-gray-600 border-gray-300">
            <h2 class="font-medium mb-3 flex items-center font-serif hidden md:block">
@@ -53,7 +53,7 @@
             @endif

             @if(details.isTracking())
-                <span class="px-1 bg-yellow-100 text-yellow-700 dark:border dark:border-yellow-600 dark:text-yellow-400 dark:bg-black rounded" title="Uses tracking scripts">Track</span>
+                <span class="px-1 bg-yellow-100 text-yellow-700 dark:border dark:border-yellow-600 dark:text-yellow-400 dark:bg-black rounded" title="Uses tracking scripts">Tracking</span>
             @endif

             @if(details.isScripts())
@@ -65,7 +65,7 @@
             @endif

             @if(details.isAffiliate())
-                <span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains Affiliate Link">Affiliate</span>
+                <span class="px-1 bg-red-100 text-red-700 dark:border dark:border-red-600 dark:text-red-400 dark:bg-black rounded" title="Contains Affiliate Link">Has Affiliate</span>
             @endif

         </div>
|
@@ -9,8 +9,8 @@
|
||||
|
||||
<div class="flex-1 p-4 space-y-4 mx-auto w-full md:w-auto">
|
||||
<div class="flex border dark:border-gray-600 rounded bg-white dark:bg-gray-800 flex-col space-y-4 pb-4 overflow-hidden md:max-w-lg" >
|
||||
<div class="flex place-items-baseline space-x-2 p-2 text-md border-b dark:border-gray-600 bg-margeblue text-white">
|
||||
<i class="fa fa-globe"></i>
|
||||
<div class="flex place-items-center space-x-2 p-2 text-md border-b dark:border-gray-600 bg-margeblue text-white">
|
||||
<img src="/site/${siteInfo.domain()}/favicon" style="width: 16px; height: 16px; vertical-align: center">
|
||||
<span>${siteInfo.domain()}</span>
|
||||
<div class="grow">
|
||||
</div>
|
||||
|
@@ -9,6 +9,14 @@ module.exports = {
         nicotine: '#f8f8ee',
         margeblue: '#3e5f6f',
         liteblue: '#0066cc',
       },
+      screens: {
+        'coarsepointer': {
+          'raw': '(pointer: coarse)'
+        },
+        'finepointer': {
+          'raw': '(pointer: fine)'
+        },
+      }
     },
     screens: {
@@ -2,7 +2,7 @@ plugins {
     id 'java'
     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }

 java {
@@ -3,7 +3,7 @@ plugins {

     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }

 application {
@@ -23,7 +23,12 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
 apply from: "$rootProject.projectDir/docker.gradle"

 dependencies {
+    implementation project(':third-party:symspell')
+
+    implementation project(':code:common:db')
+    implementation project(':code:common:model')
+    implementation project(':code:common:service')
+    implementation project(':code:common:config')

     implementation project(':code:functions:live-capture')
     implementation project(':code:functions:live-capture:api')
@@ -32,20 +37,16 @@ dependencies {
     implementation project(':code:functions:domain-info')
     implementation project(':code:functions:domain-info:api')

-    implementation project(':code:common:config')
-    implementation project(':code:common:service')
-    implementation project(':code:common:model')
-    implementation project(':code:common:db')
-
     implementation project(':code:features-search:screenshots')

     implementation project(':code:libraries:geo-ip')
     implementation project(':code:libraries:language-processing')
     implementation project(':code:libraries:term-frequency-dict')

-    implementation libs.bundles.slf4j
-    implementation project(':third-party:symspell')
-
+    implementation libs.bundles.slf4j
     implementation libs.prometheus
     implementation libs.commons.io
     implementation libs.guava
     libs.bundles.grpc.get().each {
         implementation dependencies.create(it) {
@@ -59,9 +60,7 @@ dependencies {
     implementation dependencies.create(libs.guice.get()) {
         exclude group: 'com.google.guava'
     }
-    implementation dependencies.create(libs.spark.get()) {
-        exclude group: 'org.eclipse.jetty'
-    }
+    implementation libs.bundles.jooby
+    implementation libs.bundles.jetty
     implementation libs.opencsv
     implementation libs.trove
|
@@ -3,6 +3,8 @@ package nu.marginalia.assistant;
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Injector;
|
||||
import io.jooby.ExecutionMode;
|
||||
import io.jooby.Jooby;
|
||||
import nu.marginalia.livecapture.LivecaptureModule;
|
||||
import nu.marginalia.service.MainClass;
|
||||
import nu.marginalia.service.ServiceId;
|
||||
@@ -38,8 +40,17 @@ public class AssistantMain extends MainClass {
         var configuration = injector.getInstance(ServiceConfiguration.class);
         orchestrateBoot(registry, configuration);

-        injector.getInstance(AssistantMain.class);
+        var main = injector.getInstance(AssistantMain.class);
         injector.getInstance(Initialization.class).setReady();
+
+        Jooby.runApp(new String[] { "application.env=prod" }, ExecutionMode.WORKER, () -> new Jooby() {
+            {
+                main.start(this);
+            }
+        });
     }
+
+    public void start(Jooby jooby) {
+        service.startJooby(jooby);
+    }
 }
@@ -2,27 +2,27 @@ package nu.marginalia.assistant;

 import com.google.gson.Gson;
 import com.google.inject.Inject;
+import io.jooby.Context;
+import io.jooby.Jooby;
 import nu.marginalia.assistant.suggest.Suggestions;
 import nu.marginalia.functions.domains.DomainInfoGrpcService;
 import nu.marginalia.functions.math.MathGrpcService;
 import nu.marginalia.livecapture.LiveCaptureGrpcService;
 import nu.marginalia.model.gson.GsonFactory;
 import nu.marginalia.rss.svc.FeedsGrpcService;
 import nu.marginalia.screenshot.ScreenshotService;
 import nu.marginalia.service.discovery.property.ServicePartition;
 import nu.marginalia.service.server.BaseServiceParams;
-import nu.marginalia.service.server.SparkService;
+import nu.marginalia.service.server.JoobyService;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import spark.Request;
-import spark.Response;
-import spark.Spark;

 import java.util.List;

-public class AssistantService extends SparkService {
+public class AssistantService extends JoobyService {
     private final Logger logger = LoggerFactory.getLogger(getClass());
     private final Gson gson = GsonFactory.get();
     @org.jetbrains.annotations.NotNull
     private final ScreenshotService screenshotService;
     private final Suggestions suggestions;

     @Inject
@@ -39,30 +39,30 @@ public class AssistantService extends SparkService {
                 List.of(domainInfoGrpcService,
                         mathGrpcService,
                         liveCaptureGrpcService,
-                        feedsGrpcService));
+                        feedsGrpcService),
+                List.of());
         this.screenshotService = screenshotService;

         this.suggestions = suggestions;
+    }

-        Spark.staticFiles.expireTime(600);
+    public void startJooby(Jooby jooby) {
+        super.startJooby(jooby);

-        Spark.get("/screenshot/:id", screenshotService::serveScreenshotRequest);
-        Spark.get("/suggest/", this::getSuggestions, this::convertToJson);
-
-        Spark.awaitInitialization();
+        jooby.get("/suggest/", this::getSuggestions);
+        jooby.get("/screenshot/{id}", screenshotService::serveScreenshotRequest);
     }

-    private Object getSuggestions(Request request, Response response) {
-        response.type("application/json");
-        var param = request.queryParams("partial");
-        if (param == null) {
+    private String getSuggestions(Context context) {
+        context.setResponseType("application/json");
+        var param = context.query("partial");
+        if (param.isMissing()) {
             logger.warn("Bad parameter, partial is null");
-            Spark.halt(500);
+            context.setResponseCode(500);
+            return "{}";
         }
-        return suggestions.getSuggestions(10, param);
-    }
-
-    private String convertToJson(Object o) {
-        return gson.toJson(o);
+        return gson.toJson(suggestions.getSuggestions(10, param.value()));
     }

 }
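The hunk above swaps Spark's static route registration for Jooby's instance-based API. A minimal standalone sketch of the same idioms follows; the class name and the echo route are hypothetical, not part of the repository. The key differences it illustrates: query parameters come back as value objects checked with isMissing() rather than compared against null, and the response type and status code are set on the Context.

package demo;

import io.jooby.ExecutionMode;
import io.jooby.Jooby;

// Hypothetical standalone demo of the Jooby handler pattern used in
// AssistantService above; not part of the Marginalia codebase.
public class SuggestDemo {
    public static void main(String[] args) {
        Jooby.runApp(args, ExecutionMode.WORKER, () -> new Jooby() {
            {
                // Routes are registered on the Jooby instance, replacing
                // Spark's static Spark.get(...) calls.
                get("/suggest/", ctx -> {
                    ctx.setResponseType("application/json");
                    var partial = ctx.query("partial");
                    if (partial.isMissing()) {
                        ctx.setResponseCode(500);
                        return "{}";
                    }
                    // Echo the parameter back as a one-element JSON array.
                    return "[\"" + partial.value() + "\"]";
                });
            }
        });
    }
}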
@@ -0,0 +1,118 @@
+package nu.marginalia.assistant;
+
+import com.google.common.base.Strings;
+import com.google.inject.Inject;
+import com.zaxxer.hikari.HikariDataSource;
+import io.jooby.Context;
+import nu.marginalia.db.DbDomainQueries;
+import org.apache.commons.io.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.sql.SQLException;
+
+public class ScreenshotService {
+
+    private final DbDomainQueries domainQueries;
+    private final HikariDataSource dataSource;
+
+    private final Logger logger = LoggerFactory.getLogger(getClass());
+
+    @Inject
+    public ScreenshotService(DbDomainQueries dbDomainQueries, HikariDataSource dataSource) {
+        this.domainQueries = dbDomainQueries;
+        this.dataSource = dataSource;
+    }
+
+    public boolean hasScreenshot(int domainId) {
+        try (var conn = dataSource.getConnection();
+             var ps = conn.prepareStatement("""
+                     SELECT TRUE
+                     FROM DATA_DOMAIN_SCREENSHOT
+                     INNER JOIN EC_DOMAIN ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME
+                     WHERE EC_DOMAIN.ID=?
+                     """)) {
+            ps.setInt(1, domainId);
+            var rs = ps.executeQuery();
+            if (rs.next()) {
+                return rs.getBoolean(1);
+            }
+        }
+        catch (SQLException ex) {
+            logger.warn("SQL error", ex);
+        }
+        return false;
+    }
+
+    public Object serveScreenshotRequest(Context context) {
+        if (Strings.isNullOrEmpty(context.path("id").value(""))) {
+            context.setResponseCode(404);
+            return "";
+        }
+
+        int id = context.path("id").intValue();
+
+        try (var conn = dataSource.getConnection();
+             var ps = conn.prepareStatement("""
+                     SELECT CONTENT_TYPE, DATA
+                     FROM DATA_DOMAIN_SCREENSHOT
+                     INNER JOIN EC_DOMAIN ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME
+                     WHERE EC_DOMAIN.ID=?
+                     """)) {
+            ps.setInt(1, id);
+            var rsp = ps.executeQuery();
+            if (rsp.next()) {
+                context.setResponseType(rsp.getString(1));
+                context.setResponseCode(200);
+                context.setResponseHeader("Cache-control", "public,max-age=3600");
+
+                try (var rs = context.responseStream()) {
+                    IOUtils.copy(rsp.getBlob(2).getBinaryStream(), rs);
+                }
+                return "";
+            }
+        }
+        catch (IOException ex) {
+            logger.warn("IO error", ex);
+        }
+        catch (SQLException ex) {
+            logger.warn("SQL error", ex);
+        }
+
+        context.setResponseType("image/svg+xml");
+
+        var name = domainQueries.getDomain(id).map(Object::toString)
+                .orElse("[Screenshot Not Yet Captured]");
+
+        return """
+            <?xml version="1.0" encoding="UTF-8" standalone="no"?>
+            <svg
+               xmlns="http://www.w3.org/2000/svg"
+               width="640px"
+               height="480px"
+               viewBox="0 0 640 480"
+               version="1.1">
+              <g>
+                <rect
+                   style="fill:#808080"
+                   id="rect288"
+                   width="595.41992"
+                   height="430.01825"
+                   x="23.034981"
+                   y="27.850344" />
+                <text
+                   xml:space="preserve"
+                   style="font-size:100px;fill:#909090;font-family:sans-serif;"
+                   x="20"
+                   y="120">Placeholder</text>
+                <text
+                   xml:space="preserve"
+                   style="font-size:32px;fill:#000000;font-family:monospace;"
+                   x="320" y="240" dominant-baseline="middle" text-anchor="middle">%s</text>
+              </g>
+            </svg>
+            """.formatted(name);
+    }
+
+}
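A note on the new file: serveScreenshotRequest streams the stored image straight from the database BLOB into the HTTP response, so the image bytes are never fully buffered in memory. Below is a reduced sketch of that idiom, reusing the same query and the commons-io helper; the class and method names are illustrative only, not part of the diff.

package demo;

import org.apache.commons.io.IOUtils;

import java.io.IOException;
import java.io.OutputStream;
import java.sql.Connection;
import java.sql.SQLException;

// Illustrative sketch of the BLOB-to-response streaming used in
// serveScreenshotRequest above; the caller supplies an open JDBC
// connection and the destination stream.
public class ScreenshotBlobDemo {
    public static boolean copyScreenshot(Connection conn, int domainId, OutputStream out)
            throws SQLException, IOException {
        try (var ps = conn.prepareStatement("""
                SELECT DATA
                FROM DATA_DOMAIN_SCREENSHOT
                INNER JOIN EC_DOMAIN ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME
                WHERE EC_DOMAIN.ID=?
                """)) {
            ps.setInt(1, domainId);
            var rs = ps.executeQuery();
            if (rs.next()) {
                // Copy the blob's binary stream straight to the output;
                // the image is never fully materialized in memory.
                IOUtils.copy(rs.getBlob(1).getBinaryStream(), out);
                return true;
            }
            return false;
        }
    }
}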
@@ -2,7 +2,7 @@ plugins {
     id 'java'
     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }

 java {
@@ -3,7 +3,7 @@ plugins {

     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }

 application {
@@ -42,6 +42,8 @@ dependencies {
     implementation project(':code:libraries:message-queue')

     implementation project(':code:functions:link-graph:api')
+    implementation project(':code:functions:favicon')
+    implementation project(':code:functions:favicon:api')

     implementation project(':code:processes:crawling-process:model')
     implementation project(':code:processes:crawling-process:model')
@@ -2,6 +2,7 @@ package nu.marginalia.executor;

 import com.google.inject.Inject;
 import nu.marginalia.execution.*;
+import nu.marginalia.functions.favicon.FaviconGrpcService;
 import nu.marginalia.service.discovery.property.ServicePartition;
 import nu.marginalia.service.server.BaseServiceParams;
 import nu.marginalia.service.server.SparkService;
@@ -24,6 +25,7 @@ public class ExecutorSvc extends SparkService {
                        ExecutorCrawlGrpcService executorCrawlGrpcService,
                        ExecutorSideloadGrpcService executorSideloadGrpcService,
                        ExecutorExportGrpcService executorExportGrpcService,
+                       FaviconGrpcService faviconGrpcService,
                        ExecutionInit executionInit,
                        ExecutorFileTransferService fileTransferService) throws Exception {
         super(params,
@@ -31,7 +33,8 @@
               List.of(executorGrpcService,
                       executorCrawlGrpcService,
                       executorSideloadGrpcService,
-                      executorExportGrpcService)
+                      executorExportGrpcService,
+                      faviconGrpcService)
               );

         this.executionInit = executionInit;
@@ -3,7 +3,7 @@ plugins {

     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }

 application {
@@ -3,7 +3,7 @@ plugins {

     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }

 application {
@@ -3,7 +3,7 @@ package nu.marginalia.tools;
 import com.google.inject.Guice;
 import com.google.inject.Injector;
 import nu.marginalia.converting.ConverterModule;
-import nu.marginalia.io.CrawledDomainReader;
+import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.process.log.WorkLog;
 import nu.marginalia.service.module.DatabaseModule;
@@ -40,7 +40,7 @@ public class ExperimentRunnerMain {
         Path basePath = Path.of(args[0]);
         for (var item : WorkLog.iterable(basePath.resolve("crawler.log"))) {
             Path crawlDataPath = basePath.resolve(item.relPath());
-            try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
+            try (var stream = SerializableCrawlDataStream.openDataStream(crawlDataPath)) {
                 experiment.process(stream);
             }
             catch (Exception ex) {
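The same rename recurs at each call site: the CrawledDomainReader.createDataStream factory becomes the static SerializableCrawlDataStream.openDataStream, with the stream closed via try-with-resources as above. A minimal usage sketch follows; the class name is hypothetical and the iterator-style access is an assumption, not confirmed by the diff.

import nu.marginalia.io.SerializableCrawlDataStream;

import java.nio.file.Path;

// Hypothetical call-site sketch of the renamed factory; not part of the diff.
class OpenStreamDemo {
    static void dumpCrawlData(Path crawlDataPath) throws Exception {
        // The stream holds file handles, hence try-with-resources.
        try (var stream = SerializableCrawlDataStream.openDataStream(crawlDataPath)) {
            while (stream.hasNext()) {          // assumption: iterator-style access
                System.out.println(stream.next());
            }
        }
    }
}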
@@ -26,7 +26,7 @@ import nu.marginalia.index.index.StatefulIndex;
 import nu.marginalia.index.journal.IndexJournal;
 import nu.marginalia.index.model.SearchParameters;
 import nu.marginalia.index.searchset.SearchSetAny;
-import nu.marginalia.io.CrawledDomainReader;
+import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.linkdb.docs.DocumentDbReader;
 import nu.marginalia.linkdb.docs.DocumentDbWriter;
 import nu.marginalia.loading.LoaderIndexJournalWriter;
@@ -152,7 +152,7 @@ public class IntegrationTest {

     /** PROCESS CRAWL DATA */

-    var processedDomain = domainProcessor.fullProcessing(CrawledDomainReader.createDataStream(crawlDataParquet));
+    var processedDomain = domainProcessor.fullProcessing(SerializableCrawlDataStream.openDataStream(crawlDataParquet));

     System.out.println(processedDomain);
@@ -3,7 +3,7 @@ plugins {

     id 'application'
     id 'jvm-test-suite'
-    id 'com.google.cloud.tools.jib' version '3.4.3'
+    id 'com.google.cloud.tools.jib' version '3.4.4'
 }

 java {
@@ -16,8 +16,6 @@ platforms, but for lack of suitable hardware, this can not be guaranteed.
 The civilized way of installing this is to use [SDKMAN](https://sdkman.io/);
 graalce is a good distribution choice but it doesn't matter too much.

-**Tailwindcss** - Install NPM and run `npm install -D tailwindcss`
-
 ## Quick Set up

 [https://docs.marginalia.nu/](https://docs.marginalia.nu/) has a more comprehensive guide for the install
@@ -74,3 +74,7 @@ download_model model/tfreq-new-algo3.bin https://huggingface.co/MarginaliaNu/Mar
 download_model model/lid.176.ftz https://huggingface.co/MarginaliaNu/MarginaliaModelData/resolve/c9339e4224f1dfad7f628809c32687e748198ae3/lid.176.ftz?download=true 340156704bb8c8e50c4abf35a7ec2569

 popd
+
+pushd $(dirname $0)/..
+npm install -D tailwindcss@3
+popd
@@ -16,7 +16,8 @@ include 'code:services-application:status-service'

 include 'code:functions:math'
 include 'code:functions:math:api'
-
+include 'code:functions:favicon'
+include 'code:functions:favicon:api'
 include 'code:functions:domain-info'
 include 'code:functions:domain-info:api'
@@ -160,12 +161,12 @@ dependencyResolutionManagement {
         library('prometheus-server', 'io.prometheus', 'simpleclient_httpserver').version('0.16.0')
         library('prometheus-hotspot', 'io.prometheus', 'simpleclient_hotspot').version('0.16.0')

-        library('slf4j.api', 'org.slf4j', 'slf4j-api').version('1.7.36')
+        library('slf4j.api', 'org.slf4j', 'slf4j-api').version('2.0.3')
+        library('slf4j.jdk14', 'org.slf4j', 'slf4j-jdk14').version('2.0.3')

-        library('log4j.api', 'org.apache.logging.log4j', 'log4j-api').version('2.17.2')
-        library('log4j.core', 'org.apache.logging.log4j', 'log4j-core').version('2.17.2')
-        library('log4j.slf4j', 'org.apache.logging.log4j', 'log4j-slf4j-impl').version('2.17.2')
+        library('log4j.api', 'org.apache.logging.log4j', 'log4j-api').version('2.24.3')
+        library('log4j.core', 'org.apache.logging.log4j', 'log4j-core').version('2.24.3')
+        library('log4j.slf4j', 'org.apache.logging.log4j', 'log4j-slf4j2-impl').version('2.24.3')

         library('notnull','org.jetbrains','annotations').version('24.0.0')
@@ -234,11 +235,12 @@ dependencyResolutionManagement {
         library('jetty-util','org.eclipse.jetty','jetty-util').version('9.4.54.v20240208')
         library('jetty-servlet','org.eclipse.jetty','jetty-servlet').version('9.4.54.v20240208')

-        library('slop', 'nu.marginalia', 'slop').version('0.0.9-org-5-SNAPSHOT')
+        library('slop', 'nu.marginalia', 'slop').version('0.0.10-SNAPSHOT')
         library('jooby-netty','io.jooby','jooby-netty').version(joobyVersion)
         library('jooby-jte','io.jooby','jooby-jte').version(joobyVersion)
         library('jooby-apt','io.jooby','jooby-apt').version(joobyVersion)

         library('wiremock', 'org.wiremock','wiremock').version('3.11.0')
         library('jte','gg.jte','jte').version('3.1.15')

         bundle('jetty', ['jetty-server', 'jetty-util', 'jetty-servlet'])