mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
Compare commits
25 Commits
deploy-008
...
deploy-010
Author | SHA1 | Date | |
---|---|---|---|
|
b1130d7a04 | ||
|
8364bcdc97 | ||
|
626cab5fab | ||
|
cfd4712191 | ||
|
9f18ced73d | ||
|
18e91269ab | ||
|
e315ca5758 | ||
|
3ceea17c1d | ||
|
b34527c1a3 | ||
|
185bf28fca | ||
|
78cc25584a | ||
|
62ba30bacf | ||
|
3bb84eb206 | ||
|
be7d13ccce | ||
|
8c088a7c0b | ||
|
ea9a642b9b | ||
|
27f528af6a | ||
|
20ca41ec95 | ||
|
7671f0d9e4 | ||
|
44d6bc71b7 | ||
|
9d302e2973 | ||
|
f553701224 | ||
|
f076d05595 | ||
|
b513809710 | ||
|
7519b28e21 |
@@ -22,6 +22,7 @@ public class DbDomainQueries {
|
||||
private static final Logger logger = LoggerFactory.getLogger(DbDomainQueries.class);
|
||||
|
||||
private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||
private final Cache<EdgeDomain, DomainIdWithNode> domainWithNodeCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||
private final Cache<Integer, EdgeDomain> domainNameCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||
private final Cache<String, List<DomainWithNode>> siblingsCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||
|
||||
@@ -59,6 +60,34 @@ public class DbDomainQueries {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public DomainIdWithNode getDomainIdWithNode(EdgeDomain domain) throws NoSuchElementException {
|
||||
try {
|
||||
return domainWithNodeCache.get(domain, () -> {
|
||||
try (var connection = dataSource.getConnection();
|
||||
var stmt = connection.prepareStatement("SELECT ID, NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||
|
||||
stmt.setString(1, domain.toString());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return new DomainIdWithNode(rsp.getInt(1), rsp.getInt(2));
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
throw new NoSuchElementException();
|
||||
});
|
||||
}
|
||||
catch (UncheckedExecutionException ex) {
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
catch (ExecutionException ex) {
|
||||
throw new RuntimeException(ex.getCause());
|
||||
}
|
||||
}
|
||||
|
||||
public OptionalInt tryGetDomainId(EdgeDomain domain) {
|
||||
|
||||
Integer maybeId = domainIdCache.getIfPresent(domain);
|
||||
@@ -145,4 +174,6 @@ public class DbDomainQueries {
|
||||
return nodeAffinity > 0;
|
||||
}
|
||||
}
|
||||
|
||||
public record DomainIdWithNode (int domainId, int nodeAffinity) { }
|
||||
}
|
||||
|
@@ -121,6 +121,7 @@ public class ServiceConfigurationModule extends AbstractModule {
|
||||
|
||||
while (nets.hasMoreElements()) {
|
||||
NetworkInterface netif = nets.nextElement();
|
||||
logger.info("Considering network interface {}: Up? {}, Loopback? {}", netif.getDisplayName(), netif.isUp(), netif.isLoopback());
|
||||
if (!netif.isUp() || netif.isLoopback()) {
|
||||
continue;
|
||||
}
|
||||
@@ -128,6 +129,7 @@ public class ServiceConfigurationModule extends AbstractModule {
|
||||
Enumeration<InetAddress> inetAddresses = netif.getInetAddresses();
|
||||
while (inetAddresses.hasMoreElements()) {
|
||||
InetAddress addr = inetAddresses.nextElement();
|
||||
logger.info("Considering address {}: SiteLocal? {}, Loopback? {}", addr.getHostAddress(), addr.isSiteLocalAddress(), addr.isLoopbackAddress());
|
||||
if (addr.isSiteLocalAddress() && !addr.isLoopbackAddress()) {
|
||||
return addr.getHostAddress();
|
||||
}
|
||||
|
@@ -15,6 +15,7 @@ import org.slf4j.LoggerFactory;
|
||||
import org.slf4j.Marker;
|
||||
import org.slf4j.MarkerFactory;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.List;
|
||||
@@ -106,9 +107,12 @@ public class JoobyService {
|
||||
config.externalAddress());
|
||||
|
||||
// FIXME: This won't work outside of docker, may need to submit a PR to jooby to allow classpaths here
|
||||
jooby.install(new JteModule(Path.of("/app/resources/jte"), Path.of("/app/classes/jte-precompiled")));
|
||||
jooby.assets("/*", Paths.get("/app/resources/static"));
|
||||
|
||||
if (Files.exists(Path.of("/app/resources/jte")) || Files.exists(Path.of("/app/classes/jte-precompiled"))) {
|
||||
jooby.install(new JteModule(Path.of("/app/resources/jte"), Path.of("/app/classes/jte-precompiled")));
|
||||
}
|
||||
if (Files.exists(Path.of("/app/resources/static"))) {
|
||||
jooby.assets("/*", Paths.get("/app/resources/static"));
|
||||
}
|
||||
var options = new ServerOptions();
|
||||
options.setHost(config.bindAddress());
|
||||
options.setPort(restEndpoint.port());
|
||||
|
@@ -6,25 +6,36 @@ import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.eclipse.jetty.server.Server;
|
||||
import org.eclipse.jetty.servlet.ServletContextHandler;
|
||||
import org.eclipse.jetty.servlet.ServletHolder;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.net.InetSocketAddress;
|
||||
|
||||
public class MetricsServer {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(MetricsServer.class);
|
||||
|
||||
@Inject
|
||||
public MetricsServer(ServiceConfiguration configuration) throws Exception {
|
||||
public MetricsServer(ServiceConfiguration configuration) {
|
||||
// If less than zero, we forego setting up a metrics server
|
||||
if (configuration.metricsPort() < 0)
|
||||
return;
|
||||
|
||||
Server server = new Server(new InetSocketAddress(configuration.bindAddress(), configuration.metricsPort()));
|
||||
try {
|
||||
Server server = new Server(new InetSocketAddress(configuration.bindAddress(), configuration.metricsPort()));
|
||||
|
||||
ServletContextHandler context = new ServletContextHandler();
|
||||
context.setContextPath("/");
|
||||
server.setHandler(context);
|
||||
ServletContextHandler context = new ServletContextHandler();
|
||||
context.setContextPath("/");
|
||||
server.setHandler(context);
|
||||
|
||||
context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
|
||||
context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
|
||||
|
||||
server.start();
|
||||
logger.info("MetricsServer listening on {}:{}", configuration.bindAddress(), configuration.metricsPort());
|
||||
|
||||
server.start();
|
||||
}
|
||||
catch (Exception|NoSuchMethodError ex) {
|
||||
logger.error("Failed to set up metrics server", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
47
code/functions/favicon/api/build.gradle
Normal file
47
code/functions/favicon/api/build.gradle
Normal file
@@ -0,0 +1,47 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
|
||||
id "com.google.protobuf" version "0.9.4"
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
|
||||
jar.archiveBaseName = 'favicon-api'
|
||||
|
||||
apply from: "$rootProject.projectDir/protobuf.gradle"
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
implementation libs.prometheus
|
||||
implementation libs.notnull
|
||||
implementation libs.guava
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
implementation libs.gson
|
||||
implementation libs.bundles.protobuf
|
||||
implementation libs.guava
|
||||
libs.bundles.grpc.get().each {
|
||||
implementation dependencies.create(it) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
|
||||
}
|
@@ -0,0 +1,39 @@
|
||||
package nu.marginalia.api.favicon;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
public class FaviconClient {
|
||||
private static final Logger logger = LoggerFactory.getLogger(FaviconClient.class);
|
||||
|
||||
private final GrpcMultiNodeChannelPool<FaviconAPIGrpc.FaviconAPIBlockingStub> channelPool;
|
||||
|
||||
@Inject
|
||||
public FaviconClient(GrpcChannelPoolFactory factory) {
|
||||
this.channelPool = factory.createMulti(
|
||||
ServiceKey.forGrpcApi(FaviconAPIGrpc.class, ServicePartition.multi()),
|
||||
FaviconAPIGrpc::newBlockingStub);
|
||||
}
|
||||
|
||||
public record FaviconData(byte[] bytes, String contentType) {}
|
||||
|
||||
|
||||
public Optional<FaviconData> getFavicon(String domain, int node) {
|
||||
RpcFaviconResponse rsp = channelPool.call(FaviconAPIGrpc.FaviconAPIBlockingStub::getFavicon)
|
||||
.forNode(node)
|
||||
.run(RpcFaviconRequest.newBuilder().setDomain(domain).build());
|
||||
|
||||
if (rsp.getData().isEmpty())
|
||||
return Optional.empty();
|
||||
|
||||
return Optional.of(new FaviconData(rsp.getData().toByteArray(), rsp.getContentType()));
|
||||
}
|
||||
|
||||
}
|
20
code/functions/favicon/api/src/main/protobuf/favicon.proto
Normal file
20
code/functions/favicon/api/src/main/protobuf/favicon.proto
Normal file
@@ -0,0 +1,20 @@
|
||||
syntax="proto3";
|
||||
package marginalia.api.favicon;
|
||||
|
||||
option java_package="nu.marginalia.api.favicon";
|
||||
option java_multiple_files=true;
|
||||
|
||||
service FaviconAPI {
|
||||
/** Fetches information about a domain. */
|
||||
rpc getFavicon(RpcFaviconRequest) returns (RpcFaviconResponse) {}
|
||||
}
|
||||
|
||||
message RpcFaviconRequest {
|
||||
string domain = 1;
|
||||
}
|
||||
|
||||
message RpcFaviconResponse {
|
||||
string domain = 1;
|
||||
bytes data = 2;
|
||||
string contentType = 3;
|
||||
}
|
49
code/functions/favicon/build.gradle
Normal file
49
code/functions/favicon/build.gradle
Normal file
@@ -0,0 +1,49 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:db')
|
||||
implementation project(':code:functions:favicon:api')
|
||||
implementation project(':code:processes:crawling-process')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
implementation libs.prometheus
|
||||
implementation libs.guava
|
||||
libs.bundles.grpc.get().each {
|
||||
implementation dependencies.create(it) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
implementation libs.notnull
|
||||
implementation libs.guava
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
implementation dependencies.create(libs.spark.get()) {
|
||||
exclude group: 'org.eclipse.jetty'
|
||||
}
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
|
||||
|
||||
}
|
@@ -0,0 +1,48 @@
|
||||
package nu.marginalia.functions.favicon;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.google.protobuf.ByteString;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import nu.marginalia.api.favicon.FaviconAPIGrpc;
|
||||
import nu.marginalia.api.favicon.RpcFaviconRequest;
|
||||
import nu.marginalia.api.favicon.RpcFaviconResponse;
|
||||
import nu.marginalia.crawl.DomainStateDb;
|
||||
import nu.marginalia.service.server.DiscoverableService;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
@Singleton
|
||||
public class FaviconGrpcService extends FaviconAPIGrpc.FaviconAPIImplBase implements DiscoverableService {
|
||||
private final DomainStateDb domainStateDb;
|
||||
|
||||
@Inject
|
||||
public FaviconGrpcService(DomainStateDb domainStateDb) {
|
||||
this.domainStateDb = domainStateDb;
|
||||
}
|
||||
|
||||
public boolean shouldRegisterService() {
|
||||
return domainStateDb.isAvailable();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void getFavicon(RpcFaviconRequest request, StreamObserver<RpcFaviconResponse> responseObserver) {
|
||||
Optional<DomainStateDb.FaviconRecord> icon = domainStateDb.getIcon(request.getDomain());
|
||||
|
||||
RpcFaviconResponse response;
|
||||
if (icon.isEmpty()) {
|
||||
response = RpcFaviconResponse.newBuilder().build();
|
||||
}
|
||||
else {
|
||||
var iconRecord = icon.get();
|
||||
response = RpcFaviconResponse.newBuilder()
|
||||
.setContentType(iconRecord.contentType())
|
||||
.setDomain(request.getDomain())
|
||||
.setData(ByteString.copyFrom(iconRecord.imageData()))
|
||||
.build();
|
||||
}
|
||||
|
||||
responseObserver.onNext(response);
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
}
|
@@ -11,7 +11,6 @@ import nu.marginalia.slop.column.primitive.IntColumn;
|
||||
import nu.marginalia.slop.column.primitive.LongColumn;
|
||||
import nu.marginalia.slop.column.string.EnumColumn;
|
||||
import nu.marginalia.slop.column.string.StringColumn;
|
||||
import nu.marginalia.slop.column.string.TxtStringColumn;
|
||||
import nu.marginalia.slop.desc.StorageType;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
|
||||
@@ -182,8 +181,8 @@ public record SlopDocumentRecord(
|
||||
}
|
||||
|
||||
// Basic information
|
||||
private static final TxtStringColumn domainsColumn = new TxtStringColumn("domain", StandardCharsets.UTF_8, StorageType.GZIP);
|
||||
private static final TxtStringColumn urlsColumn = new TxtStringColumn("url", StandardCharsets.UTF_8, StorageType.GZIP);
|
||||
private static final StringColumn domainsColumn = new StringColumn("domain", StandardCharsets.UTF_8, StorageType.GZIP);
|
||||
private static final StringColumn urlsColumn = new StringColumn("url", StandardCharsets.UTF_8, StorageType.GZIP);
|
||||
private static final VarintColumn ordinalsColumn = new VarintColumn("ordinal", StorageType.PLAIN);
|
||||
private static final EnumColumn statesColumn = new EnumColumn("state", StandardCharsets.US_ASCII, StorageType.PLAIN);
|
||||
private static final StringColumn stateReasonsColumn = new StringColumn("stateReason", StandardCharsets.US_ASCII, StorageType.GZIP);
|
||||
@@ -211,7 +210,7 @@ public record SlopDocumentRecord(
|
||||
private static final VarintCodedSequenceArrayColumn spansColumn = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD);
|
||||
|
||||
public static class KeywordsProjectionReader extends SlopTable {
|
||||
private final TxtStringColumn.Reader domainsReader;
|
||||
private final StringColumn.Reader domainsReader;
|
||||
private final VarintColumn.Reader ordinalsReader;
|
||||
private final IntColumn.Reader htmlFeaturesReader;
|
||||
private final LongColumn.Reader domainMetadataReader;
|
||||
@@ -275,8 +274,8 @@ public record SlopDocumentRecord(
|
||||
}
|
||||
|
||||
public static class MetadataReader extends SlopTable {
|
||||
private final TxtStringColumn.Reader domainsReader;
|
||||
private final TxtStringColumn.Reader urlsReader;
|
||||
private final StringColumn.Reader domainsReader;
|
||||
private final StringColumn.Reader urlsReader;
|
||||
private final VarintColumn.Reader ordinalsReader;
|
||||
private final StringColumn.Reader titlesReader;
|
||||
private final StringColumn.Reader descriptionsReader;
|
||||
@@ -332,8 +331,8 @@ public record SlopDocumentRecord(
|
||||
}
|
||||
|
||||
public static class Writer extends SlopTable {
|
||||
private final TxtStringColumn.Writer domainsWriter;
|
||||
private final TxtStringColumn.Writer urlsWriter;
|
||||
private final StringColumn.Writer domainsWriter;
|
||||
private final StringColumn.Writer urlsWriter;
|
||||
private final VarintColumn.Writer ordinalsWriter;
|
||||
private final EnumColumn.Writer statesWriter;
|
||||
private final StringColumn.Writer stateReasonsWriter;
|
||||
|
@@ -248,9 +248,14 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
// (this happens when the process is restarted after a crash or a shutdown)
|
||||
tasksDone.set(workLog.countFinishedJobs());
|
||||
|
||||
// Create crawl tasks and submit them to the pool for execution
|
||||
// List of deferred tasks used to ensure beneficial scheduling of domains with regard to DomainLocks,
|
||||
// merely shuffling the domains tends to lead to a lot of threads being blocked waiting for a semphore,
|
||||
// this will more aggressively attempt to schedule the jobs to avoid blocking
|
||||
List<CrawlTask> taskList = new ArrayList<>();
|
||||
|
||||
// Create crawl tasks
|
||||
for (CrawlSpecRecord crawlSpec : crawlSpecRecords) {
|
||||
if (workLog.isJobFinished(crawlSpec.domain()))
|
||||
if (workLog.isJobFinished(crawlSpec.domain))
|
||||
continue;
|
||||
|
||||
var task = new CrawlTask(
|
||||
@@ -261,11 +266,22 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
domainStateDb,
|
||||
workLog);
|
||||
|
||||
if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) == null) {
|
||||
pool.submitQuietly(task);
|
||||
// Try to run immediately, to avoid unnecessarily keeping the entire work set in RAM
|
||||
if (!trySubmitDeferredTask(task)) {
|
||||
// Otherwise add to the taskList for deferred execution
|
||||
taskList.add(task);
|
||||
}
|
||||
}
|
||||
|
||||
// Schedule viable tasks for execution until list is empty
|
||||
while (!taskList.isEmpty()) {
|
||||
taskList.removeIf(this::trySubmitDeferredTask);
|
||||
|
||||
// Add a small pause here to avoid busy looping toward the end of the execution cycle when
|
||||
// we might have no new viable tasks to run for hours on end
|
||||
TimeUnit.MILLISECONDS.sleep(50);
|
||||
}
|
||||
|
||||
logger.info("Shutting down the pool, waiting for tasks to complete...");
|
||||
|
||||
pool.shutDown();
|
||||
@@ -290,6 +306,28 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
}
|
||||
}
|
||||
|
||||
/** Submit a task for execution if it can be run, returns true if it was submitted
|
||||
* or if it can be discarded */
|
||||
private boolean trySubmitDeferredTask(CrawlTask task) {
|
||||
if (!task.canRun()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null) {
|
||||
return true; // task has already run, duplicate in crawl specs
|
||||
}
|
||||
|
||||
try {
|
||||
// This blocks the caller when the pool is full
|
||||
pool.submitQuietly(task);
|
||||
return true;
|
||||
}
|
||||
catch (RuntimeException ex) {
|
||||
logger.error("Failed to submit task " + task.domain, ex);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public void runForSingleDomain(String targetDomainName, FileStorageId fileStorageId) throws Exception {
|
||||
runForSingleDomain(targetDomainName, fileStorageService.getStorage(fileStorageId).asPath());
|
||||
}
|
||||
@@ -346,9 +384,20 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
this.id = Integer.toHexString(domain.hashCode());
|
||||
}
|
||||
|
||||
/** Best effort indicator whether we could start this now without getting stuck in
|
||||
* DomainLocks purgatory */
|
||||
public boolean canRun() {
|
||||
return domainLocks.canLock(new EdgeDomain(domain));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run() throws Exception {
|
||||
|
||||
if (workLog.isJobFinished(domain)) { // No-Op
|
||||
logger.info("Omitting task {}, as it is already run", domain);
|
||||
return;
|
||||
}
|
||||
|
||||
Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
|
||||
Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
|
||||
Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
|
||||
@@ -403,7 +452,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
logger.error("Error fetching domain " + domain, e);
|
||||
}
|
||||
finally {
|
||||
// We don't need to double-count these; it's also kept int he workLog
|
||||
// We don't need to double-count these; it's also kept in the workLog
|
||||
pendingCrawlTasks.remove(domain);
|
||||
Thread.currentThread().setName("[idle]");
|
||||
|
||||
@@ -494,7 +543,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
//
|
||||
// This must be synchronized as chewing through parquet files in parallel leads to enormous memory overhead
|
||||
private synchronized Path migrateParquetData(Path inputPath, String domain, Path crawlDataRoot) throws IOException {
|
||||
if (!inputPath.endsWith(".parquet")) {
|
||||
if (!inputPath.toString().endsWith(".parquet")) {
|
||||
return inputPath;
|
||||
}
|
||||
|
||||
|
@@ -1,5 +1,8 @@
|
||||
package nu.marginalia.crawl;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@@ -63,7 +66,29 @@ public class DomainStateDb implements AutoCloseable {
|
||||
|
||||
public record FaviconRecord(String contentType, byte[] imageData) {}
|
||||
|
||||
public DomainStateDb(Path filename) throws SQLException {
|
||||
@Inject
|
||||
public DomainStateDb(FileStorageService fileStorageService) throws SQLException {
|
||||
this(findFilename(fileStorageService));
|
||||
}
|
||||
|
||||
private static Path findFilename(FileStorageService fileStorageService) throws SQLException {
|
||||
var fsId = fileStorageService.getOnlyActiveFileStorage(FileStorageType.CRAWL_DATA);
|
||||
|
||||
if (fsId.isPresent()) {
|
||||
var fs = fileStorageService.getStorage(fsId.get());
|
||||
return fs.asPath().resolve("domainstate.db");
|
||||
}
|
||||
else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public DomainStateDb(@Nullable Path filename) throws SQLException {
|
||||
if (null == filename) {
|
||||
connection = null;
|
||||
return;
|
||||
}
|
||||
|
||||
String sqliteDbString = "jdbc:sqlite:" + filename.toString();
|
||||
connection = DriverManager.getConnection(sqliteDbString);
|
||||
|
||||
@@ -90,11 +115,18 @@ public class DomainStateDb implements AutoCloseable {
|
||||
|
||||
@Override
|
||||
public void close() throws SQLException {
|
||||
connection.close();
|
||||
if (connection != null) {
|
||||
connection.close();
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isAvailable() {
|
||||
return connection != null;
|
||||
}
|
||||
|
||||
public void saveIcon(String domain, FaviconRecord faviconRecord) {
|
||||
if (connection == null) throw new IllegalStateException("No connection to domainstate db");
|
||||
|
||||
try (var stmt = connection.prepareStatement("""
|
||||
INSERT OR REPLACE INTO favicon (domain, contentType, icon)
|
||||
VALUES(?, ?, ?)
|
||||
@@ -110,6 +142,9 @@ public class DomainStateDb implements AutoCloseable {
|
||||
}
|
||||
|
||||
public Optional<FaviconRecord> getIcon(String domain) {
|
||||
if (connection == null)
|
||||
return Optional.empty();
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT contentType, icon FROM favicon WHERE DOMAIN = ?")) {
|
||||
stmt.setString(1, domain);
|
||||
var rs = stmt.executeQuery();
|
||||
@@ -130,6 +165,8 @@ public class DomainStateDb implements AutoCloseable {
|
||||
}
|
||||
|
||||
public void save(SummaryRecord record) {
|
||||
if (connection == null) throw new IllegalStateException("No connection to domainstate db");
|
||||
|
||||
try (var stmt = connection.prepareStatement("""
|
||||
INSERT OR REPLACE INTO summary (domain, lastUpdatedEpochMs, state, stateDesc, feedUrl)
|
||||
VALUES (?, ?, ?, ?, ?)
|
||||
@@ -146,6 +183,9 @@ public class DomainStateDb implements AutoCloseable {
|
||||
}
|
||||
|
||||
public Optional<SummaryRecord> get(String domainName) {
|
||||
if (connection == null)
|
||||
return Optional.empty();
|
||||
|
||||
try (var stmt = connection.prepareStatement("""
|
||||
SELECT domain, lastUpdatedEpochMs, state, stateDesc, feedUrl
|
||||
FROM summary
|
||||
|
@@ -251,6 +251,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
return new SitemapRetriever();
|
||||
}
|
||||
|
||||
/** Recursively fetch sitemaps */
|
||||
@Override
|
||||
public List<EdgeUrl> fetchSitemapUrls(String root, CrawlDelayTimer delayTimer) {
|
||||
try {
|
||||
@@ -270,7 +271,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
while (!sitemapQueue.isEmpty() && ret.size() < 20_000 && ++fetchedSitemaps < 10) {
|
||||
var head = sitemapQueue.removeFirst();
|
||||
|
||||
switch (fetchSitemap(head)) {
|
||||
switch (fetchSingleSitemap(head)) {
|
||||
case SitemapResult.SitemapUrls(List<String> urls) -> {
|
||||
|
||||
for (var url : urls) {
|
||||
@@ -306,7 +307,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
}
|
||||
|
||||
|
||||
private SitemapResult fetchSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
|
||||
private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
|
||||
HttpRequest getRequest = HttpRequest.newBuilder()
|
||||
.GET()
|
||||
.uri(sitemapUrl.asURI())
|
||||
|
@@ -44,6 +44,14 @@ public class DomainLocks {
|
||||
return new Semaphore(2);
|
||||
}
|
||||
|
||||
public boolean canLock(EdgeDomain domain) {
|
||||
Semaphore sem = locks.get(domain.topDomain.toLowerCase());
|
||||
if (null == sem)
|
||||
return true;
|
||||
else
|
||||
return sem.availablePermits() > 0;
|
||||
}
|
||||
|
||||
public static class DomainLock implements AutoCloseable {
|
||||
private final String domainName;
|
||||
private final Semaphore semaphore;
|
||||
|
@@ -42,18 +42,20 @@ public interface SerializableCrawlDataStream extends AutoCloseable {
|
||||
{
|
||||
|
||||
String fileName = fullPath.getFileName().toString();
|
||||
if (fileName.endsWith(".parquet")) {
|
||||
|
||||
if (fileName.endsWith(".slop.zip")) {
|
||||
try {
|
||||
return new ParquetSerializableCrawlDataStream(fullPath);
|
||||
return new SlopSerializableCrawlDataStream(fullPath);
|
||||
} catch (Exception ex) {
|
||||
logger.error("Error reading domain data from " + fullPath, ex);
|
||||
return SerializableCrawlDataStream.empty();
|
||||
}
|
||||
}
|
||||
|
||||
if (fileName.endsWith(".slop.zip")) {
|
||||
else if (fileName.endsWith(".parquet")) {
|
||||
logger.error("Opening deprecated parquet-style crawl data stream", new Exception());
|
||||
try {
|
||||
return new SlopSerializableCrawlDataStream(fullPath);
|
||||
return new ParquetSerializableCrawlDataStream(fullPath);
|
||||
} catch (Exception ex) {
|
||||
logger.error("Error reading domain data from " + fullPath, ex);
|
||||
return SerializableCrawlDataStream.empty();
|
||||
|
@@ -7,8 +7,7 @@ import java.util.Arrays;
|
||||
|
||||
public enum SearchJsParameter {
|
||||
DEFAULT("default"),
|
||||
DENY_JS("no-js", "js:true"),
|
||||
REQUIRE_JS("yes-js", "js:false");
|
||||
DENY_JS("no-js", "special:scripts");
|
||||
|
||||
public final String value;
|
||||
public final String[] implictExcludeSearchTerms;
|
||||
@@ -20,7 +19,6 @@ public enum SearchJsParameter {
|
||||
|
||||
public static SearchJsParameter parse(@Nullable String value) {
|
||||
if (DENY_JS.value.equals(value)) return DENY_JS;
|
||||
if (REQUIRE_JS.value.equals(value)) return REQUIRE_JS;
|
||||
|
||||
return DEFAULT;
|
||||
}
|
||||
|
@@ -41,6 +41,7 @@ dependencies {
|
||||
|
||||
implementation project(':code:functions:live-capture:api')
|
||||
implementation project(':code:functions:math:api')
|
||||
implementation project(':code:functions:favicon:api')
|
||||
implementation project(':code:functions:domain-info:api')
|
||||
implementation project(':code:functions:search-query:api')
|
||||
|
||||
|
@@ -3,10 +3,14 @@ package nu.marginalia.search;
|
||||
import com.google.inject.Inject;
|
||||
import io.jooby.Context;
|
||||
import io.jooby.Jooby;
|
||||
import io.jooby.MediaType;
|
||||
import io.jooby.StatusCode;
|
||||
import io.prometheus.client.Counter;
|
||||
import io.prometheus.client.Histogram;
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.api.favicon.FaviconClient;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.search.svc.*;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
@@ -15,11 +19,14 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
public class SearchService extends JoobyService {
|
||||
|
||||
private final WebsiteUrl websiteUrl;
|
||||
private final SearchSiteSubscriptionService siteSubscriptionService;
|
||||
private final FaviconClient faviconClient;
|
||||
private final DbDomainQueries domainQueries;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(SearchService.class);
|
||||
private static final Histogram wmsa_search_service_request_time = Histogram.build()
|
||||
@@ -43,6 +50,8 @@ public class SearchService extends JoobyService {
|
||||
SearchSiteInfoService siteInfoService,
|
||||
SearchCrosstalkService crosstalkService,
|
||||
SearchBrowseService searchBrowseService,
|
||||
FaviconClient faviconClient,
|
||||
DbDomainQueries domainQueries,
|
||||
SearchQueryService searchQueryService)
|
||||
throws Exception {
|
||||
super(params,
|
||||
@@ -58,6 +67,8 @@ public class SearchService extends JoobyService {
|
||||
this.websiteUrl = websiteUrl;
|
||||
|
||||
this.siteSubscriptionService = siteSubscriptionService;
|
||||
this.faviconClient = faviconClient;
|
||||
this.domainQueries = domainQueries;
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -71,6 +82,31 @@ public class SearchService extends JoobyService {
|
||||
jooby.get("/site/https://*", this::handleSiteUrlRedirect);
|
||||
jooby.get("/site/http://*", this::handleSiteUrlRedirect);
|
||||
|
||||
jooby.get("/site/{domain}/favicon", ctx -> {
|
||||
String domain = ctx.path("domain").value();
|
||||
logger.info("Finding icon for domain {}", domain);
|
||||
domainQueries.getDomainId(new EdgeDomain(domain));
|
||||
try {
|
||||
DbDomainQueries.DomainIdWithNode domainIdWithNode = domainQueries.getDomainIdWithNode(new EdgeDomain(domain));
|
||||
var faviconMaybe = faviconClient.getFavicon(domain, domainIdWithNode.nodeAffinity());
|
||||
|
||||
if (faviconMaybe.isEmpty()) {
|
||||
ctx.setResponseCode(404);
|
||||
return "";
|
||||
} else {
|
||||
var favicon = faviconMaybe.get();
|
||||
|
||||
ctx.responseStream(MediaType.valueOf(favicon.contentType()), consumer -> {
|
||||
consumer.write(favicon.bytes());
|
||||
});
|
||||
}
|
||||
}
|
||||
catch (NoSuchElementException ex) {
|
||||
ctx.setResponseCode(404);
|
||||
}
|
||||
return "";
|
||||
});
|
||||
|
||||
jooby.before((Context ctx) -> {
|
||||
ctx.setAttribute(startTimeAttribute, System.nanoTime());
|
||||
});
|
||||
|
@@ -7,9 +7,7 @@ import java.util.Arrays;
|
||||
|
||||
public enum SearchJsParameter {
|
||||
DEFAULT("default"),
|
||||
DENY_JS("no-js", "js:true"),
|
||||
REQUIRE_JS("yes-js", "js:false");
|
||||
|
||||
DENY_JS("no-js", "special:scripts");
|
||||
public final String value;
|
||||
public final String[] implictExcludeSearchTerms;
|
||||
|
||||
@@ -20,7 +18,6 @@ public enum SearchJsParameter {
|
||||
|
||||
public static SearchJsParameter parse(@Nullable String value) {
|
||||
if (DENY_JS.value.equals(value)) return DENY_JS;
|
||||
if (REQUIRE_JS.value.equals(value)) return REQUIRE_JS;
|
||||
|
||||
return DEFAULT;
|
||||
}
|
||||
|
@@ -86,8 +86,10 @@ public record SearchParameters(WebsiteUrl url,
|
||||
public String renderUrl() {
|
||||
|
||||
StringBuilder pathBuilder = new StringBuilder("/search?");
|
||||
pathBuilder.append("query=").append(URLEncoder.encode(query, StandardCharsets.UTF_8));
|
||||
|
||||
if (query != null) {
|
||||
pathBuilder.append("query=").append(URLEncoder.encode(query, StandardCharsets.UTF_8));
|
||||
}
|
||||
if (profile != SearchProfile.NO_FILTER) {
|
||||
pathBuilder.append("&profile=").append(URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8));
|
||||
}
|
||||
|
@@ -67,6 +67,10 @@ public class DecoratedSearchResults {
|
||||
return focusDomainId >= 0;
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return results.isEmpty();
|
||||
}
|
||||
|
||||
public SearchFilters getFilters() {
|
||||
return filters;
|
||||
}
|
||||
|
@@ -81,6 +81,7 @@ public class SearchFilters {
|
||||
),
|
||||
List.of(
|
||||
new Filter("Vintage", "fa-clock-rotate-left", SearchProfile.VINTAGE, parameters),
|
||||
new Filter("Small Web", "fa-minus", SearchProfile.SMALLWEB, parameters),
|
||||
new Filter("Plain Text", "fa-file", SearchProfile.PLAIN_TEXT, parameters),
|
||||
new Filter("Tilde", "fa-house", SearchProfile.TILDE, parameters)
|
||||
),
|
||||
|
@@ -9,6 +9,14 @@
|
||||
nicotine: '#f8f8ee',
|
||||
margeblue: '#3e5f6f',
|
||||
liteblue: '#0066cc',
|
||||
},
|
||||
screens: {
|
||||
'coarsepointer': {
|
||||
'raw': '(pointer: coarse)'
|
||||
},
|
||||
'finepointer': {
|
||||
'raw': '(pointer: fine)'
|
||||
},
|
||||
}
|
||||
},
|
||||
screens: {
|
||||
|
@@ -23,7 +23,7 @@
|
||||
@template.serp.part.searchform(query = results.getParams().query(), profile = results.getProfile(), filters = results.getFilters())
|
||||
</div>
|
||||
<div class="grow"></div>
|
||||
<button class="fixed bottom-10 right-5 sm:hidden text-sm bg-margeblue text-white p-4 rounded-xl active:text-slate-200" id="filter-button">
|
||||
<button class="fixed bottom-10 right-5 finepointer:hidden md:hidden text-sm bg-margeblue text-white p-4 rounded-xl active:text-slate-200" id="filter-button">
|
||||
<i class="fas fa-filter mr-3"></i>
|
||||
Filters
|
||||
</button>
|
||||
@@ -44,6 +44,11 @@
|
||||
<div class="grow"></div>
|
||||
<a href="${results.getParams().renderUrlWithoutSiteFocus()}" class="fa fa-remove"></a>
|
||||
</div>
|
||||
@elseif (results.isEmpty())
|
||||
<div class="border dark:border-gray-600 rounded flex space-x-4 bg-white dark:bg-gray-800 text-gray-600 dark:text-gray-100 text-sm p-4 items-center">
|
||||
No search results found. Try different search terms, or spelling variations. The search engine currently
|
||||
only supports queries in the English language.
|
||||
</div>
|
||||
@endif
|
||||
|
||||
<div class="space-y-4 sm:space-y-6">
|
||||
|
@@ -26,15 +26,15 @@
|
||||
It operates a bit like a clock, starting at the top and working its way around clockwise.</p>
|
||||
|
||||
<div class="flex gap-4 place-items-middle">
|
||||
@template.serp.part.matchogram(mask = 90)
|
||||
@template.serp.part.matchogram(mask = 90, domain = "example.com")
|
||||
<div>This is by the beginning</div>
|
||||
</div>
|
||||
<div class="flex gap-4 place-items-middle">
|
||||
@template.serp.part.matchogram(mask = 90L<<26)
|
||||
@template.serp.part.matchogram(mask = 90L<<26, domain = "example.com")
|
||||
<div>This is in the middle</div>
|
||||
</div>
|
||||
<div class="flex gap-4 place-items-middle">
|
||||
@template.serp.part.matchogram(mask = 5L<<48)
|
||||
@template.serp.part.matchogram(mask = 5L<<48, domain = "example.com")
|
||||
<div>This is toward the end</div>
|
||||
</div>
|
||||
|
||||
|
@@ -1,11 +1,13 @@
|
||||
@import java.util.stream.IntStream
|
||||
|
||||
@param long mask
|
||||
@param String domain
|
||||
|
||||
<svg width="40" height="40">
|
||||
<svg width="40" height="40"
|
||||
style="background-image: url('/site/${domain}/favicon'); background-repeat: no-repeat; background-size: 16px 16px; background-position: center; ">
|
||||
<circle
|
||||
cx="18"
|
||||
cy="18"
|
||||
cx="20"
|
||||
cy="20"
|
||||
r="16"
|
||||
fill="none"
|
||||
stroke="#eee"
|
||||
@@ -13,10 +15,10 @@
|
||||
/>
|
||||
@for (int bit : IntStream.range(0, 56).filter(bit -> (mask & (1L << bit)) != 0).toArray())
|
||||
<line
|
||||
x1="${18 + 15*Math.sin(2 * Math.PI * bit / 56.)}"
|
||||
y1="${18 - 15*Math.cos(2 * Math.PI * bit / 56.)}"
|
||||
x2="${18 + 17*Math.sin(2 * Math.PI * bit / 56.)}"
|
||||
y2="${18 - 17*Math.cos(2 * Math.PI * bit / 56.)}"
|
||||
x1="${20 + 15*Math.sin(2 * Math.PI * bit / 56.)}"
|
||||
y1="${20 - 15*Math.cos(2 * Math.PI * bit / 56.)}"
|
||||
x2="${20 + 17*Math.sin(2 * Math.PI * bit / 56.)}"
|
||||
y2="${20 - 17*Math.cos(2 * Math.PI * bit / 56.)}"
|
||||
stroke="#444"
|
||||
stroke-width="2"
|
||||
/>
|
||||
|
@@ -12,7 +12,7 @@
|
||||
<div class="flex flex-col grow" >
|
||||
<div class="flex flex-row space-x-2 place-items-center">
|
||||
<div class="flex-0" title="Match density">
|
||||
@template.serp.part.matchogram(mask = result.first.positionsMask)
|
||||
@template.serp.part.matchogram(mask = result.first.positionsMask, domain=result.getFirst().url.domain.toString())
|
||||
</div>
|
||||
<div class="flex grow justify-between items-start">
|
||||
<div class="flex-1">
|
||||
|
@@ -3,7 +3,7 @@
|
||||
|
||||
@param SearchFilters filters
|
||||
|
||||
<aside class="md:w-64 py-4 shrink-0 hidden sm:block">
|
||||
<aside class="md:w-64 py-4 shrink-0 hidden md:block finepointer:block">
|
||||
<div class="space-y-6 sticky top-4">
|
||||
<div class="bg-white dark:bg-gray-800 p-4 border dark:border-gray-600 border-gray-300">
|
||||
<h2 class="font-medium mb-3 flex items-center font-serif hidden md:block">
|
||||
|
@@ -9,8 +9,8 @@
|
||||
|
||||
<div class="flex-1 p-4 space-y-4 mx-auto w-full md:w-auto">
|
||||
<div class="flex border dark:border-gray-600 rounded bg-white dark:bg-gray-800 flex-col space-y-4 pb-4 overflow-hidden md:max-w-lg" >
|
||||
<div class="flex place-items-baseline space-x-2 p-2 text-md border-b dark:border-gray-600 bg-margeblue text-white">
|
||||
<i class="fa fa-globe"></i>
|
||||
<div class="flex place-items-center space-x-2 p-2 text-md border-b dark:border-gray-600 bg-margeblue text-white">
|
||||
<img src="/site/${siteInfo.domain()}/favicon" style="width: 16px; height: 16px; vertical-align: center">
|
||||
<span>${siteInfo.domain()}</span>
|
||||
<div class="grow">
|
||||
</div>
|
||||
|
@@ -9,6 +9,14 @@ module.exports = {
|
||||
nicotine: '#f8f8ee',
|
||||
margeblue: '#3e5f6f',
|
||||
liteblue: '#0066cc',
|
||||
},
|
||||
screens: {
|
||||
'coarsepointer': {
|
||||
'raw': '(pointer: coarse)'
|
||||
},
|
||||
'finepointer': {
|
||||
'raw': '(pointer: fine)'
|
||||
},
|
||||
}
|
||||
},
|
||||
screens: {
|
||||
|
@@ -23,7 +23,12 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
apply from: "$rootProject.projectDir/docker.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation project(':third-party:symspell')
|
||||
|
||||
|
||||
implementation project(':code:common:db')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:common:config')
|
||||
|
||||
implementation project(':code:functions:live-capture')
|
||||
implementation project(':code:functions:live-capture:api')
|
||||
@@ -32,20 +37,16 @@ dependencies {
|
||||
implementation project(':code:functions:domain-info')
|
||||
implementation project(':code:functions:domain-info:api')
|
||||
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:common:service')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':code:common:db')
|
||||
|
||||
implementation project(':code:features-search:screenshots')
|
||||
|
||||
implementation project(':code:libraries:geo-ip')
|
||||
implementation project(':code:libraries:language-processing')
|
||||
implementation project(':code:libraries:term-frequency-dict')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
implementation project(':third-party:symspell')
|
||||
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.prometheus
|
||||
implementation libs.commons.io
|
||||
implementation libs.guava
|
||||
libs.bundles.grpc.get().each {
|
||||
implementation dependencies.create(it) {
|
||||
@@ -59,9 +60,7 @@ dependencies {
|
||||
implementation dependencies.create(libs.guice.get()) {
|
||||
exclude group: 'com.google.guava'
|
||||
}
|
||||
implementation dependencies.create(libs.spark.get()) {
|
||||
exclude group: 'org.eclipse.jetty'
|
||||
}
|
||||
implementation libs.bundles.jooby
|
||||
implementation libs.bundles.jetty
|
||||
implementation libs.opencsv
|
||||
implementation libs.trove
|
||||
|
@@ -3,6 +3,8 @@ package nu.marginalia.assistant;
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Injector;
|
||||
import io.jooby.ExecutionMode;
|
||||
import io.jooby.Jooby;
|
||||
import nu.marginalia.livecapture.LivecaptureModule;
|
||||
import nu.marginalia.service.MainClass;
|
||||
import nu.marginalia.service.ServiceId;
|
||||
@@ -38,8 +40,17 @@ public class AssistantMain extends MainClass {
|
||||
var configuration = injector.getInstance(ServiceConfiguration.class);
|
||||
orchestrateBoot(registry, configuration);
|
||||
|
||||
injector.getInstance(AssistantMain.class);
|
||||
var main = injector.getInstance(AssistantMain.class);
|
||||
injector.getInstance(Initialization.class).setReady();
|
||||
|
||||
Jooby.runApp(new String[] { "application.env=prod" }, ExecutionMode.WORKER, () -> new Jooby() {
|
||||
{
|
||||
main.start(this);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
public void start(Jooby jooby) {
|
||||
service.startJooby(jooby);
|
||||
}
|
||||
}
|
||||
|
@@ -2,27 +2,27 @@ package nu.marginalia.assistant;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import io.jooby.Context;
|
||||
import io.jooby.Jooby;
|
||||
import nu.marginalia.assistant.suggest.Suggestions;
|
||||
import nu.marginalia.functions.domains.DomainInfoGrpcService;
|
||||
import nu.marginalia.functions.math.MathGrpcService;
|
||||
import nu.marginalia.livecapture.LiveCaptureGrpcService;
|
||||
import nu.marginalia.model.gson.GsonFactory;
|
||||
import nu.marginalia.rss.svc.FeedsGrpcService;
|
||||
import nu.marginalia.screenshot.ScreenshotService;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
import nu.marginalia.service.server.SparkService;
|
||||
import nu.marginalia.service.server.JoobyService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class AssistantService extends SparkService {
|
||||
public class AssistantService extends JoobyService {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final Gson gson = GsonFactory.get();
|
||||
@org.jetbrains.annotations.NotNull
|
||||
private final ScreenshotService screenshotService;
|
||||
private final Suggestions suggestions;
|
||||
|
||||
@Inject
|
||||
@@ -39,30 +39,30 @@ public class AssistantService extends SparkService {
|
||||
List.of(domainInfoGrpcService,
|
||||
mathGrpcService,
|
||||
liveCaptureGrpcService,
|
||||
feedsGrpcService));
|
||||
feedsGrpcService),
|
||||
List.of());
|
||||
this.screenshotService = screenshotService;
|
||||
|
||||
this.suggestions = suggestions;
|
||||
|
||||
Spark.staticFiles.expireTime(600);
|
||||
|
||||
Spark.get("/screenshot/:id", screenshotService::serveScreenshotRequest);
|
||||
Spark.get("/suggest/", this::getSuggestions, this::convertToJson);
|
||||
|
||||
Spark.awaitInitialization();
|
||||
}
|
||||
|
||||
private Object getSuggestions(Request request, Response response) {
|
||||
response.type("application/json");
|
||||
var param = request.queryParams("partial");
|
||||
if (param == null) {
|
||||
public void startJooby(Jooby jooby) {
|
||||
super.startJooby(jooby);
|
||||
|
||||
jooby.get("/suggest/", this::getSuggestions);
|
||||
jooby.get("/screenshot/{id}", screenshotService::serveScreenshotRequest);
|
||||
}
|
||||
|
||||
private String getSuggestions(Context context) {
|
||||
context.setResponseType("application/json");
|
||||
var param = context.query("partial");
|
||||
if (param.isMissing()) {
|
||||
logger.warn("Bad parameter, partial is null");
|
||||
Spark.halt(500);
|
||||
context.setResponseCode(500);
|
||||
return "{}";
|
||||
}
|
||||
return suggestions.getSuggestions(10, param);
|
||||
}
|
||||
|
||||
private String convertToJson(Object o) {
|
||||
return gson.toJson(o);
|
||||
return gson.toJson(suggestions.getSuggestions(10, param.value()));
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -0,0 +1,118 @@
|
||||
package nu.marginalia.assistant;
|
||||
|
||||
import com.google.common.base.Strings;
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import io.jooby.Context;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
|
||||
public class ScreenshotService {
|
||||
|
||||
private final DbDomainQueries domainQueries;
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@Inject
|
||||
public ScreenshotService(DbDomainQueries dbDomainQueries, HikariDataSource dataSource) {
|
||||
this.domainQueries = dbDomainQueries;
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
public boolean hasScreenshot(int domainId) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var ps = conn.prepareStatement("""
|
||||
SELECT TRUE
|
||||
FROM DATA_DOMAIN_SCREENSHOT
|
||||
INNER JOIN EC_DOMAIN ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME
|
||||
WHERE EC_DOMAIN.ID=?
|
||||
""")) {
|
||||
ps.setInt(1, domainId);
|
||||
var rs = ps.executeQuery();
|
||||
if (rs.next()) {
|
||||
return rs.getBoolean(1);
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.warn("SQL error", ex);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public Object serveScreenshotRequest(Context context) {
|
||||
if (Strings.isNullOrEmpty(context.path("id").value(""))) {
|
||||
context.setResponseCode(404);
|
||||
return "";
|
||||
}
|
||||
|
||||
int id = context.path("id").intValue();
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var ps = conn.prepareStatement("""
|
||||
SELECT CONTENT_TYPE, DATA
|
||||
FROM DATA_DOMAIN_SCREENSHOT
|
||||
INNER JOIN EC_DOMAIN ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_SCREENSHOT.DOMAIN_NAME
|
||||
WHERE EC_DOMAIN.ID=?
|
||||
""")) {
|
||||
ps.setInt(1, id);
|
||||
var rsp = ps.executeQuery();
|
||||
if (rsp.next()) {
|
||||
context.setResponseType(rsp.getString(1));
|
||||
context.setResponseCode(200);
|
||||
context.setResponseHeader("Cache-control", "public,max-age=3600");
|
||||
|
||||
try (var rs = context.responseStream()) {
|
||||
IOUtils.copy(rsp.getBlob(2).getBinaryStream(), rs);
|
||||
}
|
||||
return "";
|
||||
}
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.warn("IO error", ex);
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.warn("SQL error", ex);
|
||||
}
|
||||
|
||||
context.setResponseType("image/svg+xml");
|
||||
|
||||
var name = domainQueries.getDomain(id).map(Object::toString)
|
||||
.orElse("[Screenshot Not Yet Captured]");
|
||||
|
||||
return """
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<svg
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
width="640px"
|
||||
height="480px"
|
||||
viewBox="0 0 640 480"
|
||||
version="1.1">
|
||||
<g>
|
||||
<rect
|
||||
style="fill:#808080"
|
||||
id="rect288"
|
||||
width="595.41992"
|
||||
height="430.01825"
|
||||
x="23.034981"
|
||||
y="27.850344" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
style="font-size:100px;fill:#909090;font-family:sans-serif;"
|
||||
x="20"
|
||||
y="120">Placeholder</text>
|
||||
<text
|
||||
xml:space="preserve"
|
||||
style="font-size:32px;fill:#000000;font-family:monospace;"
|
||||
x="320" y="240" dominant-baseline="middle" text-anchor="middle">%s</text>
|
||||
</g>
|
||||
</svg>
|
||||
""".formatted(name);
|
||||
}
|
||||
|
||||
}
|
@@ -42,6 +42,8 @@ dependencies {
|
||||
implementation project(':code:libraries:message-queue')
|
||||
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
implementation project(':code:functions:favicon')
|
||||
implementation project(':code:functions:favicon:api')
|
||||
|
||||
implementation project(':code:processes:crawling-process:model')
|
||||
implementation project(':code:processes:crawling-process:model')
|
||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.executor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.execution.*;
|
||||
import nu.marginalia.functions.favicon.FaviconGrpcService;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
import nu.marginalia.service.server.SparkService;
|
||||
@@ -24,6 +25,7 @@ public class ExecutorSvc extends SparkService {
|
||||
ExecutorCrawlGrpcService executorCrawlGrpcService,
|
||||
ExecutorSideloadGrpcService executorSideloadGrpcService,
|
||||
ExecutorExportGrpcService executorExportGrpcService,
|
||||
FaviconGrpcService faviconGrpcService,
|
||||
ExecutionInit executionInit,
|
||||
ExecutorFileTransferService fileTransferService) throws Exception {
|
||||
super(params,
|
||||
@@ -31,7 +33,8 @@ public class ExecutorSvc extends SparkService {
|
||||
List.of(executorGrpcService,
|
||||
executorCrawlGrpcService,
|
||||
executorSideloadGrpcService,
|
||||
executorExportGrpcService)
|
||||
executorExportGrpcService,
|
||||
faviconGrpcService)
|
||||
);
|
||||
|
||||
this.executionInit = executionInit;
|
||||
|
@@ -16,7 +16,8 @@ include 'code:services-application:status-service'
|
||||
|
||||
include 'code:functions:math'
|
||||
include 'code:functions:math:api'
|
||||
|
||||
include 'code:functions:favicon'
|
||||
include 'code:functions:favicon:api'
|
||||
include 'code:functions:domain-info'
|
||||
include 'code:functions:domain-info:api'
|
||||
|
||||
@@ -160,12 +161,12 @@ dependencyResolutionManagement {
|
||||
library('prometheus-server', 'io.prometheus', 'simpleclient_httpserver').version('0.16.0')
|
||||
library('prometheus-hotspot', 'io.prometheus', 'simpleclient_hotspot').version('0.16.0')
|
||||
|
||||
library('slf4j.api', 'org.slf4j', 'slf4j-api').version('1.7.36')
|
||||
library('slf4j.api', 'org.slf4j', 'slf4j-api').version('2.0.3')
|
||||
library('slf4j.jdk14', 'org.slf4j', 'slf4j-jdk14').version('2.0.3')
|
||||
|
||||
library('log4j.api', 'org.apache.logging.log4j', 'log4j-api').version('2.17.2')
|
||||
library('log4j.core', 'org.apache.logging.log4j', 'log4j-core').version('2.17.2')
|
||||
library('log4j.slf4j', 'org.apache.logging.log4j', 'log4j-slf4j-impl').version('2.17.2')
|
||||
library('log4j.api', 'org.apache.logging.log4j', 'log4j-api').version('2.24.3')
|
||||
library('log4j.core', 'org.apache.logging.log4j', 'log4j-core').version('2.24.3')
|
||||
library('log4j.slf4j', 'org.apache.logging.log4j', 'log4j-slf4j2-impl').version('2.24.3')
|
||||
|
||||
library('notnull','org.jetbrains','annotations').version('24.0.0')
|
||||
|
||||
|
Reference in New Issue
Block a user