mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
24 Commits
deploy-009
...
deploy-011
Author | SHA1 | Date | |
---|---|---|---|
|
1c2426a052 | ||
|
34df7441ac | ||
|
5387e2bd80 | ||
|
0f3b24d0f8 | ||
|
a732095d2a | ||
|
6607f0112f | ||
|
4913730de9 | ||
|
1db64f9d56 | ||
|
4dcff14498 | ||
|
426658f64e | ||
|
2181b22f05 | ||
|
42bd79a609 | ||
|
b91c1e528a | ||
|
b1130d7a04 | ||
|
8364bcdc97 | ||
|
626cab5fab | ||
|
cfd4712191 | ||
|
9f18ced73d | ||
|
18e91269ab | ||
|
e315ca5758 | ||
|
3ceea17c1d | ||
|
b34527c1a3 | ||
|
185bf28fca | ||
|
78cc25584a |
@@ -43,12 +43,11 @@ subprojects.forEach {it ->
|
||||
}
|
||||
|
||||
ext {
|
||||
jvmVersion=23
|
||||
dockerImageBase='container-registry.oracle.com/graalvm/jdk:23'
|
||||
jvmVersion = 24
|
||||
dockerImageBase='container-registry.oracle.com/graalvm/jdk:24'
|
||||
dockerImageTag='latest'
|
||||
dockerImageRegistry='marginalia'
|
||||
jibVersion = '3.4.4'
|
||||
|
||||
}
|
||||
|
||||
idea {
|
||||
|
@@ -22,6 +22,7 @@ public class DbDomainQueries {
|
||||
private static final Logger logger = LoggerFactory.getLogger(DbDomainQueries.class);
|
||||
|
||||
private final Cache<EdgeDomain, Integer> domainIdCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||
private final Cache<EdgeDomain, DomainIdWithNode> domainWithNodeCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||
private final Cache<Integer, EdgeDomain> domainNameCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||
private final Cache<String, List<DomainWithNode>> siblingsCache = CacheBuilder.newBuilder().maximumSize(10_000).build();
|
||||
|
||||
@@ -59,6 +60,34 @@ public class DbDomainQueries {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public DomainIdWithNode getDomainIdWithNode(EdgeDomain domain) throws NoSuchElementException {
|
||||
try {
|
||||
return domainWithNodeCache.get(domain, () -> {
|
||||
try (var connection = dataSource.getConnection();
|
||||
var stmt = connection.prepareStatement("SELECT ID, NODE_AFFINITY FROM EC_DOMAIN WHERE DOMAIN_NAME=?")) {
|
||||
|
||||
stmt.setString(1, domain.toString());
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return new DomainIdWithNode(rsp.getInt(1), rsp.getInt(2));
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
throw new NoSuchElementException();
|
||||
});
|
||||
}
|
||||
catch (UncheckedExecutionException ex) {
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
catch (ExecutionException ex) {
|
||||
throw new RuntimeException(ex.getCause());
|
||||
}
|
||||
}
|
||||
|
||||
public OptionalInt tryGetDomainId(EdgeDomain domain) {
|
||||
|
||||
Integer maybeId = domainIdCache.getIfPresent(domain);
|
||||
@@ -145,4 +174,6 @@ public class DbDomainQueries {
|
||||
return nodeAffinity > 0;
|
||||
}
|
||||
}
|
||||
|
||||
public record DomainIdWithNode (int domainId, int nodeAffinity) { }
|
||||
}
|
||||
|
@@ -14,7 +14,7 @@ public class EdgeDomain implements Serializable {
|
||||
@Nonnull
|
||||
public final String topDomain;
|
||||
|
||||
public EdgeDomain(String host) {
|
||||
public EdgeDomain(@Nonnull String host) {
|
||||
Objects.requireNonNull(host, "domain name must not be null");
|
||||
|
||||
host = host.toLowerCase();
|
||||
@@ -61,6 +61,10 @@ public class EdgeDomain implements Serializable {
|
||||
this.topDomain = topDomain;
|
||||
}
|
||||
|
||||
public static String getTopDomain(String host) {
|
||||
return new EdgeDomain(host).topDomain;
|
||||
}
|
||||
|
||||
private boolean looksLikeGovTld(String host) {
|
||||
if (host.length() < 8)
|
||||
return false;
|
||||
@@ -116,24 +120,6 @@ public class EdgeDomain implements Serializable {
|
||||
return topDomain.substring(0, cutPoint).toLowerCase();
|
||||
}
|
||||
|
||||
public String getLongDomainKey() {
|
||||
StringBuilder ret = new StringBuilder();
|
||||
|
||||
int cutPoint = topDomain.indexOf('.');
|
||||
if (cutPoint < 0) {
|
||||
ret.append(topDomain);
|
||||
} else {
|
||||
ret.append(topDomain, 0, cutPoint);
|
||||
}
|
||||
|
||||
if (!subDomain.isEmpty() && !"www".equals(subDomain)) {
|
||||
ret.append(":");
|
||||
ret.append(subDomain);
|
||||
}
|
||||
|
||||
return ret.toString().toLowerCase();
|
||||
}
|
||||
|
||||
/** If possible, try to provide an alias domain,
|
||||
* i.e. a domain name that is very likely to link to this one
|
||||
* */
|
||||
|
@@ -25,7 +25,7 @@ import static org.mockito.Mockito.when;
|
||||
class ZkServiceRegistryTest {
|
||||
private static final int ZOOKEEPER_PORT = 2181;
|
||||
private static final GenericContainer<?> zookeeper =
|
||||
new GenericContainer<>("zookeeper:3.8.0")
|
||||
new GenericContainer<>("zookeeper:3.8")
|
||||
.withExposedPorts(ZOOKEEPER_PORT);
|
||||
|
||||
List<ZkServiceRegistry> registries = new ArrayList<>();
|
||||
|
47
code/functions/favicon/api/build.gradle
Normal file
47
code/functions/favicon/api/build.gradle
Normal file
@@ -0,0 +1,47 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
|
||||
id "com.google.protobuf" version "0.9.4"
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
|
||||
jar.archiveBaseName = 'favicon-api'
|
||||
|
||||
apply from: "$rootProject.projectDir/protobuf.gradle"
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
    implementation project(':code:common:model')
    implementation project(':code:common:config')
    implementation project(':code:common:service')

    implementation libs.bundles.slf4j

    implementation libs.prometheus
    implementation libs.notnull
    // Guava is declared once here; guice and grpc below exclude their own
    // com.google.guava group (presumably so only this version ends up on the classpath)
    implementation libs.guava
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
    implementation libs.gson
    implementation libs.bundles.protobuf
    libs.bundles.grpc.get().each {
        implementation dependencies.create(it) {
            exclude group: 'com.google.guava'
        }
    }

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}
|
@@ -0,0 +1,39 @@
|
||||
package nu.marginalia.api.favicon;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||
import nu.marginalia.service.client.GrpcMultiNodeChannelPool;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
public class FaviconClient {
|
||||
private static final Logger logger = LoggerFactory.getLogger(FaviconClient.class);
|
||||
|
||||
private final GrpcMultiNodeChannelPool<FaviconAPIGrpc.FaviconAPIBlockingStub> channelPool;
|
||||
|
||||
@Inject
|
||||
public FaviconClient(GrpcChannelPoolFactory factory) {
|
||||
this.channelPool = factory.createMulti(
|
||||
ServiceKey.forGrpcApi(FaviconAPIGrpc.class, ServicePartition.multi()),
|
||||
FaviconAPIGrpc::newBlockingStub);
|
||||
}
|
||||
|
||||
public record FaviconData(byte[] bytes, String contentType) {}
|
||||
|
||||
|
||||
public Optional<FaviconData> getFavicon(String domain, int node) {
|
||||
RpcFaviconResponse rsp = channelPool.call(FaviconAPIGrpc.FaviconAPIBlockingStub::getFavicon)
|
||||
.forNode(node)
|
||||
.run(RpcFaviconRequest.newBuilder().setDomain(domain).build());
|
||||
|
||||
if (rsp.getData().isEmpty())
|
||||
return Optional.empty();
|
||||
|
||||
return Optional.of(new FaviconData(rsp.getData().toByteArray(), rsp.getContentType()));
|
||||
}
|
||||
|
||||
}
|
20
code/functions/favicon/api/src/main/protobuf/favicon.proto
Normal file
20
code/functions/favicon/api/src/main/protobuf/favicon.proto
Normal file
@@ -0,0 +1,20 @@
|
||||
syntax="proto3";
|
||||
package marginalia.api.favicon;
|
||||
|
||||
option java_package="nu.marginalia.api.favicon";
|
||||
option java_multiple_files=true;
|
||||
|
||||
service FaviconAPI {
|
||||
/** Fetches the favicon data and content type stored for a domain. */
|
||||
rpc getFavicon(RpcFaviconRequest) returns (RpcFaviconResponse) {}
|
||||
}
|
||||
|
||||
message RpcFaviconRequest {
|
||||
string domain = 1;
|
||||
}
|
||||
|
||||
message RpcFaviconResponse {
|
||||
string domain = 1;
|
||||
bytes data = 2;
|
||||
string contentType = 3;
|
||||
}
|
49
code/functions/favicon/build.gradle
Normal file
49
code/functions/favicon/build.gradle
Normal file
@@ -0,0 +1,49 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
|
||||
id 'application'
|
||||
id 'jvm-test-suite'
|
||||
}
|
||||
|
||||
java {
|
||||
toolchain {
|
||||
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
|
||||
}
|
||||
}
|
||||
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
    implementation project(':code:common:config')
    implementation project(':code:common:service')
    implementation project(':code:common:model')
    implementation project(':code:common:db')
    implementation project(':code:functions:favicon:api')
    implementation project(':code:processes:crawling-process')

    implementation libs.bundles.slf4j

    implementation libs.prometheus
    // Guava is declared once here; grpc and guice below exclude their own
    // com.google.guava group (presumably so only this version ends up on the classpath)
    implementation libs.guava
    libs.bundles.grpc.get().each {
        implementation dependencies.create(it) {
            exclude group: 'com.google.guava'
        }
    }

    implementation libs.notnull
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
    implementation dependencies.create(libs.spark.get()) {
        exclude group: 'org.eclipse.jetty'
    }

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}
|
@@ -0,0 +1,48 @@
|
||||
package nu.marginalia.functions.favicon;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.google.protobuf.ByteString;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import nu.marginalia.api.favicon.FaviconAPIGrpc;
|
||||
import nu.marginalia.api.favicon.RpcFaviconRequest;
|
||||
import nu.marginalia.api.favicon.RpcFaviconResponse;
|
||||
import nu.marginalia.crawl.DomainStateDb;
|
||||
import nu.marginalia.service.server.DiscoverableService;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
@Singleton
|
||||
public class FaviconGrpcService extends FaviconAPIGrpc.FaviconAPIImplBase implements DiscoverableService {
|
||||
private final DomainStateDb domainStateDb;
|
||||
|
||||
@Inject
|
||||
public FaviconGrpcService(DomainStateDb domainStateDb) {
|
||||
this.domainStateDb = domainStateDb;
|
||||
}
|
||||
|
||||
public boolean shouldRegisterService() {
|
||||
return domainStateDb.isAvailable();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void getFavicon(RpcFaviconRequest request, StreamObserver<RpcFaviconResponse> responseObserver) {
|
||||
Optional<DomainStateDb.FaviconRecord> icon = domainStateDb.getIcon(request.getDomain());
|
||||
|
||||
RpcFaviconResponse response;
|
||||
if (icon.isEmpty()) {
|
||||
response = RpcFaviconResponse.newBuilder().build();
|
||||
}
|
||||
else {
|
||||
var iconRecord = icon.get();
|
||||
response = RpcFaviconResponse.newBuilder()
|
||||
.setContentType(iconRecord.contentType())
|
||||
.setDomain(request.getDomain())
|
||||
.setData(ByteString.copyFrom(iconRecord.imageData()))
|
||||
.build();
|
||||
}
|
||||
|
||||
responseObserver.onNext(response);
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
}
|
@@ -23,16 +23,33 @@ public class SimpleBlockingThreadPool {
|
||||
private final Logger logger = LoggerFactory.getLogger(SimpleBlockingThreadPool.class);
|
||||
|
||||
public SimpleBlockingThreadPool(String name, int poolSize, int queueSize) {
|
||||
this(name, poolSize, queueSize, ThreadType.PLATFORM);
|
||||
}
|
||||
|
||||
public SimpleBlockingThreadPool(String name, int poolSize, int queueSize, ThreadType threadType) {
|
||||
tasks = new ArrayBlockingQueue<>(queueSize);
|
||||
|
||||
for (int i = 0; i < poolSize; i++) {
|
||||
Thread worker = new Thread(this::worker, name + "[" + i + "]");
|
||||
worker.setDaemon(true);
|
||||
worker.start();
|
||||
|
||||
Thread.Builder threadBuilder = switch (threadType) {
|
||||
case VIRTUAL -> Thread.ofVirtual();
|
||||
case PLATFORM -> Thread.ofPlatform().daemon(true);
|
||||
};
|
||||
|
||||
Thread worker = threadBuilder
|
||||
.name(name + "[" + i + "]")
|
||||
.start(this::worker);
|
||||
|
||||
workers.add(worker);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public enum ThreadType {
|
||||
VIRTUAL,
|
||||
PLATFORM
|
||||
}
|
||||
|
||||
public void submit(Task task) throws InterruptedException {
|
||||
tasks.put(task);
|
||||
}
|
||||
|
@@ -105,7 +105,8 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
|
||||
pool = new SimpleBlockingThreadPool("CrawlerPool",
|
||||
Integer.getInteger("crawler.poolSize", 256),
|
||||
1);
|
||||
1,
|
||||
SimpleBlockingThreadPool.ThreadType.VIRTUAL);
|
||||
|
||||
|
||||
// Wait for the blacklist to be loaded before starting the crawl
|
||||
@@ -221,10 +222,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
|
||||
logger.info("Loaded {} domains", crawlSpecRecords.size());
|
||||
|
||||
// Shuffle the domains to ensure we get a good mix of domains in each crawl,
|
||||
// so that e.g. the big domains don't get all crawled at once, or we end up
|
||||
// crawling the same server in parallel from different subdomains...
|
||||
Collections.shuffle(crawlSpecRecords);
|
||||
crawlSpecRecords.sort(crawlSpecArrangement(crawlSpecRecords));
|
||||
|
||||
// First a validation run to ensure the file is all good to parse
|
||||
if (crawlSpecRecords.isEmpty()) {
|
||||
@@ -248,44 +246,35 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
// List of deferred tasks used to ensure beneficial scheduling of domains with regard to DomainLocks,
|
||||
// merely shuffling the domains tends to lead to a lot of threads being blocked waiting for a semaphore,
|
||||
// this will more aggressively attempt to schedule the jobs to avoid blocking
|
||||
List<CrawlTask> deferredTasks = new LinkedList<>();
|
||||
List<CrawlTask> taskList = new ArrayList<>();
|
||||
|
||||
// Create crawl tasks and submit them to the pool for execution
|
||||
// Create crawl tasks
|
||||
for (CrawlSpecRecord crawlSpec : crawlSpecRecords) {
|
||||
if (workLog.isJobFinished(crawlSpec.domain()))
|
||||
if (workLog.isJobFinished(crawlSpec.domain))
|
||||
continue;
|
||||
|
||||
// Add to the end of the deferral list
|
||||
deferredTasks.addLast(new CrawlTask(
|
||||
var task = new CrawlTask(
|
||||
crawlSpec,
|
||||
anchorTagsSource,
|
||||
outputDir,
|
||||
warcArchiver,
|
||||
domainStateDb,
|
||||
workLog));
|
||||
workLog);
|
||||
|
||||
// Start every task we currently can from the deferral list
|
||||
deferredTasks.removeIf(task -> {
|
||||
if (task.canRun()) {
|
||||
if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) != null) {
|
||||
return true; // task has already run, duplicate in crawl specs
|
||||
}
|
||||
|
||||
// This blocks the caller when the pool is full
|
||||
pool.submitQuietly(task);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
});
|
||||
// Try to run immediately, to avoid unnecessarily keeping the entire work set in RAM
|
||||
if (!trySubmitDeferredTask(task)) {
|
||||
// Otherwise add to the taskList for deferred execution
|
||||
taskList.add(task);
|
||||
}
|
||||
}
|
||||
|
||||
// Schedule any lingering tasks for immediate execution
|
||||
for (var task : deferredTasks) {
|
||||
if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null)
|
||||
continue;
|
||||
// Schedule viable tasks for execution until list is empty
|
||||
while (!taskList.isEmpty()) {
|
||||
taskList.removeIf(this::trySubmitDeferredTask);
|
||||
|
||||
pool.submitQuietly(task);
|
||||
// Add a small pause here to avoid busy looping toward the end of the execution cycle when
|
||||
// we might have no new viable tasks to run for hours on end
|
||||
TimeUnit.MILLISECONDS.sleep(50);
|
||||
}
|
||||
|
||||
logger.info("Shutting down the pool, waiting for tasks to complete...");
|
||||
@@ -312,6 +301,50 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
}
|
||||
}
|
||||
|
||||
/** Create a comparator that sorts the crawl specs in a way that is beneficial for the crawl,
|
||||
* we want to enqueue domains that tend ro be large and have common top domains first,
|
||||
* but otherwise have a random order.
|
||||
* <p></p>
|
||||
* Note, we can't use hash codes for randomization as it is not desirable to have the same order
|
||||
* every time the process is restarted (and CrawlSpecRecord is a record, which defines equals and
|
||||
* hashcode based on the fields).
|
||||
* */
|
||||
private Comparator<CrawlSpecRecord> crawlSpecArrangement(List<CrawlSpecRecord> records) {
|
||||
Random r = new Random();
|
||||
Map<String, Integer> randomOrder = new HashMap<>(records.size());
|
||||
|
||||
for (var spec : records) {
|
||||
randomOrder.put(spec.domain, r.nextInt());
|
||||
}
|
||||
|
||||
return Comparator.comparing((CrawlSpecRecord spec) -> spec.domain.contains(".edu"))
|
||||
.reversed()
|
||||
.thenComparing(spec -> randomOrder.get(spec.domain))
|
||||
.thenComparing(Record::hashCode); // non-deterministic tie-breaker to
|
||||
}
|
||||
|
||||
/** Submit a task for execution if it can be run, returns true if it was submitted
|
||||
* or if it can be discarded */
|
||||
private boolean trySubmitDeferredTask(CrawlTask task) {
|
||||
if (!task.canRun()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null) {
|
||||
return true; // task has already run, duplicate in crawl specs
|
||||
}
|
||||
|
||||
try {
|
||||
// This blocks the caller when the pool is full
|
||||
pool.submitQuietly(task);
|
||||
return true;
|
||||
}
|
||||
catch (RuntimeException ex) {
|
||||
logger.error("Failed to submit task " + task.domain, ex);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public void runForSingleDomain(String targetDomainName, FileStorageId fileStorageId) throws Exception {
|
||||
runForSingleDomain(targetDomainName, fileStorageService.getStorage(fileStorageId).asPath());
|
||||
}
|
||||
@@ -377,6 +410,11 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
@Override
|
||||
public void run() throws Exception {
|
||||
|
||||
if (workLog.isJobFinished(domain)) { // No-Op
|
||||
logger.info("Omitting task {}, as it is already run", domain);
|
||||
return;
|
||||
}
|
||||
|
||||
Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
|
||||
Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
|
||||
Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
|
||||
@@ -431,7 +469,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
logger.error("Error fetching domain " + domain, e);
|
||||
}
|
||||
finally {
|
||||
// We don't need to double-count these; it's also kept int he workLog
|
||||
// We don't need to double-count these; it's also kept in the workLog
|
||||
pendingCrawlTasks.remove(domain);
|
||||
Thread.currentThread().setName("[idle]");
|
||||
|
||||
@@ -522,7 +560,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
//
|
||||
// This must be synchronized as chewing through parquet files in parallel leads to enormous memory overhead
|
||||
private synchronized Path migrateParquetData(Path inputPath, String domain, Path crawlDataRoot) throws IOException {
|
||||
if (!inputPath.endsWith(".parquet")) {
|
||||
if (!inputPath.toString().endsWith(".parquet")) {
|
||||
return inputPath;
|
||||
}
|
||||
|
||||
|
@@ -1,5 +1,8 @@
|
||||
package nu.marginalia.crawl;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@@ -63,7 +66,29 @@ public class DomainStateDb implements AutoCloseable {
|
||||
|
||||
public record FaviconRecord(String contentType, byte[] imageData) {}
|
||||
|
||||
public DomainStateDb(Path filename) throws SQLException {
|
||||
@Inject
|
||||
public DomainStateDb(FileStorageService fileStorageService) throws SQLException {
|
||||
this(findFilename(fileStorageService));
|
||||
}
|
||||
|
||||
private static Path findFilename(FileStorageService fileStorageService) throws SQLException {
|
||||
var fsId = fileStorageService.getOnlyActiveFileStorage(FileStorageType.CRAWL_DATA);
|
||||
|
||||
if (fsId.isPresent()) {
|
||||
var fs = fileStorageService.getStorage(fsId.get());
|
||||
return fs.asPath().resolve("domainstate.db");
|
||||
}
|
||||
else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public DomainStateDb(@Nullable Path filename) throws SQLException {
|
||||
if (null == filename) {
|
||||
connection = null;
|
||||
return;
|
||||
}
|
||||
|
||||
String sqliteDbString = "jdbc:sqlite:" + filename.toString();
|
||||
connection = DriverManager.getConnection(sqliteDbString);
|
||||
|
||||
@@ -90,11 +115,18 @@ public class DomainStateDb implements AutoCloseable {
|
||||
|
||||
@Override
|
||||
public void close() throws SQLException {
|
||||
connection.close();
|
||||
if (connection != null) {
|
||||
connection.close();
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isAvailable() {
|
||||
return connection != null;
|
||||
}
|
||||
|
||||
public void saveIcon(String domain, FaviconRecord faviconRecord) {
|
||||
if (connection == null) throw new IllegalStateException("No connection to domainstate db");
|
||||
|
||||
try (var stmt = connection.prepareStatement("""
|
||||
INSERT OR REPLACE INTO favicon (domain, contentType, icon)
|
||||
VALUES(?, ?, ?)
|
||||
@@ -110,6 +142,9 @@ public class DomainStateDb implements AutoCloseable {
|
||||
}
|
||||
|
||||
public Optional<FaviconRecord> getIcon(String domain) {
|
||||
if (connection == null)
|
||||
return Optional.empty();
|
||||
|
||||
try (var stmt = connection.prepareStatement("SELECT contentType, icon FROM favicon WHERE DOMAIN = ?")) {
|
||||
stmt.setString(1, domain);
|
||||
var rs = stmt.executeQuery();
|
||||
@@ -130,6 +165,8 @@ public class DomainStateDb implements AutoCloseable {
|
||||
}
|
||||
|
||||
public void save(SummaryRecord record) {
|
||||
if (connection == null) throw new IllegalStateException("No connection to domainstate db");
|
||||
|
||||
try (var stmt = connection.prepareStatement("""
|
||||
INSERT OR REPLACE INTO summary (domain, lastUpdatedEpochMs, state, stateDesc, feedUrl)
|
||||
VALUES (?, ?, ?, ?, ?)
|
||||
@@ -146,6 +183,9 @@ public class DomainStateDb implements AutoCloseable {
|
||||
}
|
||||
|
||||
public Optional<SummaryRecord> get(String domainName) {
|
||||
if (connection == null)
|
||||
return Optional.empty();
|
||||
|
||||
try (var stmt = connection.prepareStatement("""
|
||||
SELECT domain, lastUpdatedEpochMs, state, stateDesc, feedUrl
|
||||
FROM summary
|
||||
|
@@ -30,6 +30,7 @@ import java.net.http.HttpTimeoutException;
|
||||
import java.time.Duration;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.Semaphore;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
|
||||
@@ -60,7 +61,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
.cookieHandler(cookies)
|
||||
.followRedirects(HttpClient.Redirect.NORMAL)
|
||||
.connectTimeout(Duration.ofSeconds(8))
|
||||
.executor(Executors.newCachedThreadPool())
|
||||
.executor(Executors.newVirtualThreadPerTaskExecutor())
|
||||
.build();
|
||||
}
|
||||
|
||||
@@ -116,7 +117,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
|
||||
for (int tries = 0;; tries++) {
|
||||
try {
|
||||
var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
|
||||
var rsp = SendLock.wrapSend(client, head, HttpResponse.BodyHandlers.discarding());
|
||||
EdgeUrl rspUri = new EdgeUrl(rsp.uri());
|
||||
|
||||
if (!Objects.equals(rspUri.domain, url.domain)) {
|
||||
@@ -153,7 +154,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
.timeout(requestTimeout)
|
||||
;
|
||||
|
||||
var rsp = client.send(headBuilder.build(), HttpResponse.BodyHandlers.discarding());
|
||||
var rsp = SendLock.wrapSend(client, headBuilder.build(), HttpResponse.BodyHandlers.discarding());
|
||||
var headers = rsp.headers();
|
||||
|
||||
var contentTypeHeader = headers.firstValue("Content-Type").orElse(null);
|
||||
@@ -229,21 +230,24 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
|
||||
contentTags.paint(getBuilder);
|
||||
|
||||
HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());
|
||||
try (var sl = new SendLock()) {
|
||||
HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());
|
||||
|
||||
if (result instanceof HttpFetchResult.ResultOk ok) {
|
||||
if (ok.statusCode() == 429) {
|
||||
throw new RateLimitException(Objects.requireNonNullElse(ok.header("Retry-After"), "1"));
|
||||
}
|
||||
if (ok.statusCode() == 304) {
|
||||
return new HttpFetchResult.Result304Raw();
|
||||
}
|
||||
if (ok.statusCode() == 200) {
|
||||
return ok;
|
||||
if (result instanceof HttpFetchResult.ResultOk ok) {
|
||||
if (ok.statusCode() == 429) {
|
||||
throw new RateLimitException(Objects.requireNonNullElse(ok.header("Retry-After"), "1"));
|
||||
}
|
||||
if (ok.statusCode() == 304) {
|
||||
return new HttpFetchResult.Result304Raw();
|
||||
}
|
||||
if (ok.statusCode() == 200) {
|
||||
return ok;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -317,22 +321,28 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
.timeout(requestTimeout)
|
||||
.build();
|
||||
|
||||
var response = client.send(getRequest, HttpResponse.BodyHandlers.ofInputStream());
|
||||
if (response.statusCode() != 200) {
|
||||
return new SitemapResult.SitemapError();
|
||||
}
|
||||
|
||||
try (InputStream inputStream = response.body()) {
|
||||
|
||||
InputStream parserStream;
|
||||
if (sitemapUrl.path.endsWith(".gz")) {
|
||||
parserStream = new GZIPInputStream(inputStream);
|
||||
}
|
||||
else {
|
||||
parserStream = inputStream;
|
||||
try (var sl = new SendLock()) {
|
||||
var response = client.send(getRequest, HttpResponse.BodyHandlers.ofInputStream());
|
||||
if (response.statusCode() != 200) {
|
||||
return new SitemapResult.SitemapError();
|
||||
}
|
||||
|
||||
Document parsedSitemap;
|
||||
|
||||
try (InputStream inputStream = response.body()) {
|
||||
InputStream parserStream;
|
||||
if (sitemapUrl.path.endsWith(".gz")) {
|
||||
parserStream = new GZIPInputStream(inputStream);
|
||||
} else {
|
||||
parserStream = inputStream;
|
||||
}
|
||||
|
||||
parsedSitemap = Jsoup.parse(parserStream, "UTF-8", sitemapUrl.toString(), Parser.xmlParser());
|
||||
}
|
||||
finally {
|
||||
sl.close();
|
||||
}
|
||||
|
||||
Document parsedSitemap = Jsoup.parse(parserStream, "UTF-8", sitemapUrl.toString(), Parser.xmlParser());
|
||||
if (parsedSitemap.childrenSize() == 0) {
|
||||
return new SitemapResult.SitemapError();
|
||||
}
|
||||
@@ -386,7 +396,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
}
|
||||
|
||||
private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
|
||||
try {
|
||||
try (var sl = new SendLock()) {
|
||||
var getRequest = HttpRequest.newBuilder()
|
||||
.GET()
|
||||
.uri(url.asURI())
|
||||
@@ -429,5 +439,30 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
 * Holds one permit of a shared semaphore that caps the number of concurrently
 * running HTTP sends. The permit is taken in the constructor and handed back by
 * {@link #close()}; the cap is read from the "crawler.maxConcurrentRequests"
 * system property and defaults to 512.
 */
class SendLock implements AutoCloseable {

    private static final Semaphore maxConcurrentRequests = new Semaphore(Integer.getInteger("crawler.maxConcurrentRequests", 512));

    /** Set once the permit has been returned, making repeated close() calls harmless */
    boolean closed = false;

    /** Blocks, ignoring interrupts, until a permit is available. */
    public SendLock() {
        maxConcurrentRequests.acquireUninterruptibly();
    }

    /**
     * Performs {@code client.send(request, handler)} while holding a permit,
     * releasing the permit whether the send returns or throws.
     */
    public static <T> HttpResponse<T> wrapSend(HttpClient client, HttpRequest request, HttpResponse.BodyHandler<T> handler) throws IOException, InterruptedException {
        SendLock permit = new SendLock();
        try {
            return client.send(request, handler);
        }
        finally {
            permit.close();
        }
    }

    /** Returns the permit to the semaphore; only the first call has any effect. */
    @Override
    public void close() {
        if (closed) {
            return;
        }

        maxConcurrentRequests.release();
        closed = true;
    }
}
|
||||
|
||||
|
@@ -3,6 +3,7 @@ package nu.marginalia.crawl.retreival;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.concurrent.ThreadLocalRandom;
|
||||
|
||||
import static java.lang.Math.max;
|
||||
import static java.lang.Math.min;
|
||||
@@ -53,12 +54,13 @@ public class CrawlDelayTimer {
|
||||
public void waitFetchDelay(long spentTime) {
|
||||
long sleepTime = delayTime;
|
||||
|
||||
long jitter = ThreadLocalRandom.current().nextLong(0, 150);
|
||||
try {
|
||||
if (sleepTime >= 1) {
|
||||
if (spentTime > sleepTime)
|
||||
return;
|
||||
|
||||
Thread.sleep(min(sleepTime - spentTime, 5000));
|
||||
Thread.sleep(min(sleepTime - spentTime, 5000) + jitter);
|
||||
} else {
|
||||
// When no crawl delay is specified, lean toward twice the fetch+process time,
|
||||
// within sane limits. This means slower servers get slower crawling, and faster
|
||||
@@ -71,17 +73,17 @@ public class CrawlDelayTimer {
|
||||
if (spentTime > sleepTime)
|
||||
return;
|
||||
|
||||
Thread.sleep(sleepTime - spentTime);
|
||||
Thread.sleep(sleepTime - spentTime + jitter);
|
||||
}
|
||||
|
||||
if (slowDown) {
|
||||
// Additional delay when the server is signalling it wants slower requests
|
||||
Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS);
|
||||
Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS + jitter);
|
||||
}
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
throw new RuntimeException();
|
||||
throw new RuntimeException("Interrupted", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -108,6 +108,11 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, delayTimer);
|
||||
domainStateDb.save(summaryRecord);
|
||||
|
||||
if (Thread.interrupted()) {
|
||||
// There's a small chance we're interrupted during the sniffing portion
|
||||
throw new InterruptedException();
|
||||
}
|
||||
|
||||
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
|
||||
if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
|
||||
// If we have reference data, we will always grow the crawl depth a bit
|
||||
@@ -140,7 +145,6 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
CrawlDelayTimer delayTimer,
|
||||
DomainLinks domainLinks) {
|
||||
|
||||
|
||||
// Add external links to the crawl frontier
|
||||
crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));
|
||||
|
||||
@@ -289,6 +293,10 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Error configuring link filter", ex);
|
||||
if (Thread.interrupted()) {
|
||||
Thread.currentThread().interrupt();
|
||||
return DomainStateDb.SummaryRecord.forError(domain, "Crawler Interrupted", ex.getMessage());
|
||||
}
|
||||
}
|
||||
finally {
|
||||
crawlFrontier.addVisited(rootUrl);
|
||||
|
@@ -46,6 +46,10 @@ public class CrawlerRevisitor {
|
||||
break;
|
||||
}
|
||||
|
||||
if (Thread.interrupted()) {
|
||||
throw new InterruptedException();
|
||||
}
|
||||
|
||||
var urlMaybe = EdgeUrl.parse(doc.url);
|
||||
if (urlMaybe.isEmpty())
|
||||
continue;
|
||||
|
@@ -42,18 +42,20 @@ public interface SerializableCrawlDataStream extends AutoCloseable {
|
||||
{
|
||||
|
||||
String fileName = fullPath.getFileName().toString();
|
||||
if (fileName.endsWith(".parquet")) {
|
||||
|
||||
if (fileName.endsWith(".slop.zip")) {
|
||||
try {
|
||||
return new ParquetSerializableCrawlDataStream(fullPath);
|
||||
return new SlopSerializableCrawlDataStream(fullPath);
|
||||
} catch (Exception ex) {
|
||||
logger.error("Error reading domain data from " + fullPath, ex);
|
||||
return SerializableCrawlDataStream.empty();
|
||||
}
|
||||
}
|
||||
|
||||
if (fileName.endsWith(".slop.zip")) {
|
||||
else if (fileName.endsWith(".parquet")) {
|
||||
logger.error("Opening deprecated parquet-style crawl data stream", new Exception());
|
||||
try {
|
||||
return new SlopSerializableCrawlDataStream(fullPath);
|
||||
return new ParquetSerializableCrawlDataStream(fullPath);
|
||||
} catch (Exception ex) {
|
||||
logger.error("Error reading domain data from " + fullPath, ex);
|
||||
return SerializableCrawlDataStream.empty();
|
||||
|
@@ -41,6 +41,7 @@ dependencies {
|
||||
|
||||
implementation project(':code:functions:live-capture:api')
|
||||
implementation project(':code:functions:math:api')
|
||||
implementation project(':code:functions:favicon:api')
|
||||
implementation project(':code:functions:domain-info:api')
|
||||
implementation project(':code:functions:search-query:api')
|
||||
|
||||
|
@@ -3,10 +3,14 @@ package nu.marginalia.search;
|
||||
import com.google.inject.Inject;
|
||||
import io.jooby.Context;
|
||||
import io.jooby.Jooby;
|
||||
import io.jooby.MediaType;
|
||||
import io.jooby.StatusCode;
|
||||
import io.prometheus.client.Counter;
|
||||
import io.prometheus.client.Histogram;
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.api.favicon.FaviconClient;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.search.svc.*;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
@@ -15,11 +19,14 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
public class SearchService extends JoobyService {
|
||||
|
||||
private final WebsiteUrl websiteUrl;
|
||||
private final SearchSiteSubscriptionService siteSubscriptionService;
|
||||
private final FaviconClient faviconClient;
|
||||
private final DbDomainQueries domainQueries;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(SearchService.class);
|
||||
private static final Histogram wmsa_search_service_request_time = Histogram.build()
|
||||
@@ -43,6 +50,8 @@ public class SearchService extends JoobyService {
|
||||
SearchSiteInfoService siteInfoService,
|
||||
SearchCrosstalkService crosstalkService,
|
||||
SearchBrowseService searchBrowseService,
|
||||
FaviconClient faviconClient,
|
||||
DbDomainQueries domainQueries,
|
||||
SearchQueryService searchQueryService)
|
||||
throws Exception {
|
||||
super(params,
|
||||
@@ -58,6 +67,8 @@ public class SearchService extends JoobyService {
|
||||
this.websiteUrl = websiteUrl;
|
||||
|
||||
this.siteSubscriptionService = siteSubscriptionService;
|
||||
this.faviconClient = faviconClient;
|
||||
this.domainQueries = domainQueries;
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -71,6 +82,32 @@ public class SearchService extends JoobyService {
|
||||
jooby.get("/site/https://*", this::handleSiteUrlRedirect);
|
||||
jooby.get("/site/http://*", this::handleSiteUrlRedirect);
|
||||
|
||||
String emptySvg = "<svg xmlns=\"http://www.w3.org/2000/svg\"></svg>";
|
||||
jooby.get("/site/{domain}/favicon", ctx -> {
|
||||
String domain = ctx.path("domain").value();
|
||||
logger.info("Finding icon for domain {}", domain);
|
||||
try {
|
||||
DbDomainQueries.DomainIdWithNode domainIdWithNode = domainQueries.getDomainIdWithNode(new EdgeDomain(domain));
|
||||
var faviconMaybe = faviconClient.getFavicon(domain, domainIdWithNode.nodeAffinity());
|
||||
|
||||
if (faviconMaybe.isEmpty()) {
|
||||
ctx.setResponseType(MediaType.valueOf("image/svg+xml"));
|
||||
return emptySvg;
|
||||
} else {
|
||||
var favicon = faviconMaybe.get();
|
||||
|
||||
ctx.responseStream(MediaType.valueOf(favicon.contentType()), consumer -> {
|
||||
consumer.write(favicon.bytes());
|
||||
});
|
||||
}
|
||||
}
|
||||
catch (NoSuchElementException ex) {
|
||||
ctx.setResponseType(MediaType.valueOf("image/svg+xml"));
|
||||
return emptySvg;
|
||||
}
|
||||
return "";
|
||||
});
|
||||
|
||||
jooby.before((Context ctx) -> {
|
||||
ctx.setAttribute(startTimeAttribute, System.nanoTime());
|
||||
});
|
||||
|
@@ -81,6 +81,7 @@ public class SearchFilters {
|
||||
),
|
||||
List.of(
|
||||
new Filter("Vintage", "fa-clock-rotate-left", SearchProfile.VINTAGE, parameters),
|
||||
new Filter("Small Web", "fa-minus", SearchProfile.SMALLWEB, parameters),
|
||||
new Filter("Plain Text", "fa-file", SearchProfile.PLAIN_TEXT, parameters),
|
||||
new Filter("Tilde", "fa-house", SearchProfile.TILDE, parameters)
|
||||
),
|
||||
|
@@ -10,7 +10,7 @@
|
||||
|
||||
@template.part.head(title = "Marginalia Search - Explore")
|
||||
|
||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans ">
|
||||
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans ">
|
||||
|
||||
@template.part.navbar(navbar = navbar)
|
||||
|
||||
@@ -23,7 +23,7 @@
|
||||
</header>
|
||||
|
||||
<div class="max-w-[1400px] mx-auto flex flex-col gap-1 place-items-center">
|
||||
<div class="border dark:border-gray-600 bg-white dark:bg-gray-800 dark:text-gray-100 my-4 p-3 rounded overflow-hidden flex flex-col space-y-4">
|
||||
<div class="border border-gray-300 dark:border-gray-600 bg-white dark:bg-gray-800 dark:text-gray-100 my-4 p-3 rounded overflow-hidden flex flex-col space-y-4">
|
||||
@if (results.hasFocusDomain())
|
||||
<div class="flex space-x-1">
|
||||
<span>Showing websites similar to <a class="font-mono text-liteblue dark:text-blue-200" href="/site/${results.focusDomain()}"><i class="fas fa-globe"></i> <span class="underline">${results.focusDomain()}</span></a></span>
|
||||
@@ -36,7 +36,7 @@
|
||||
</div>
|
||||
<div class="grid-cols-1 gap-4 sm:grid sm:grid-cols-1 md:grid-cols-3 xl:grid-cols-4 mx-auto sm:p-4">
|
||||
@for (BrowseResult result : results.results())
|
||||
<div class="bg-white border dark:border-gray-600 dark:bg-gray-800 rounded overflow-hidden">
|
||||
<div class="bg-white border border-gray-300 dark:border-gray-600 dark:bg-gray-800 rounded overflow-hidden">
|
||||
<div class="bg-margeblue text-white p-2 flex space-x-4 text-sm">
|
||||
<span class="break-words">${result.displayDomain()}</span>
|
||||
<div class="grow"></div>
|
||||
|
@@ -9,6 +9,15 @@
|
||||
nicotine: '#f8f8ee',
|
||||
margeblue: '#3e5f6f',
|
||||
liteblue: '#0066cc',
|
||||
bgblue: '#e5e9eb',
|
||||
},
|
||||
screens: {
|
||||
'coarsepointer': {
|
||||
'raw': '(pointer: coarse)'
|
||||
},
|
||||
'finepointer': {
|
||||
'raw': '(pointer: fine)'
|
||||
},
|
||||
}
|
||||
},
|
||||
screens: {
|
||||
|
@@ -15,7 +15,7 @@
|
||||
|
||||
@template.part.head(title = "Marginalia Search - " + parameters.query())
|
||||
|
||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
||||
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||
@template.part.navbar(navbar = navbar)
|
||||
|
||||
<div>
|
||||
|
@@ -9,7 +9,7 @@
|
||||
|
||||
@template.part.head(title = "Marginalia Search - Error")
|
||||
|
||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
||||
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||
|
||||
@template.part.navbar(navbar = navbar)
|
||||
|
||||
|
@@ -11,7 +11,7 @@
|
||||
|
||||
@template.part.head(title = "Marginalia Search - " + results.getQuery())
|
||||
|
||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
||||
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||
@template.part.navbar(navbar = navbar)
|
||||
|
||||
<div>
|
||||
@@ -23,7 +23,7 @@
|
||||
@template.serp.part.searchform(query = results.getParams().query(), profile = results.getProfile(), filters = results.getFilters())
|
||||
</div>
|
||||
<div class="grow"></div>
|
||||
<button class="fixed bottom-10 right-5 sm:hidden text-sm bg-margeblue text-white p-4 rounded-xl active:text-slate-200" id="filter-button">
|
||||
<button class="fixed bottom-10 right-5 finepointer:hidden md:hidden text-sm bg-margeblue text-white p-4 rounded-xl active:text-slate-200" id="filter-button">
|
||||
<i class="fas fa-filter mr-3"></i>
|
||||
Filters
|
||||
</button>
|
||||
|
@@ -26,15 +26,15 @@
|
||||
It operates a bit like a clock, starting at the top and working its way around clockwise.</p>
|
||||
|
||||
<div class="flex gap-4 place-items-middle">
|
||||
@template.serp.part.matchogram(mask = 90)
|
||||
@template.serp.part.matchogram(mask = 90, domain = "example.com")
|
||||
<div>This is by the beginning</div>
|
||||
</div>
|
||||
<div class="flex gap-4 place-items-middle">
|
||||
@template.serp.part.matchogram(mask = 90L<<26)
|
||||
@template.serp.part.matchogram(mask = 90L<<26, domain = "example.com")
|
||||
<div>This is in the middle</div>
|
||||
</div>
|
||||
<div class="flex gap-4 place-items-middle">
|
||||
@template.serp.part.matchogram(mask = 5L<<48)
|
||||
@template.serp.part.matchogram(mask = 5L<<48, domain = "example.com")
|
||||
<div>This is toward the end</div>
|
||||
</div>
|
||||
|
||||
|
@@ -1,11 +1,13 @@
|
||||
@import java.util.stream.IntStream
|
||||
|
||||
@param long mask
|
||||
@param String domain
|
||||
|
||||
<svg width="40" height="40">
|
||||
<svg width="40" height="40"
|
||||
style="background-image: url('/site/${domain}/favicon'); background-repeat: no-repeat; background-size: 16px 16px; background-position: center; ">
|
||||
<circle
|
||||
cx="18"
|
||||
cy="18"
|
||||
cx="20"
|
||||
cy="20"
|
||||
r="16"
|
||||
fill="none"
|
||||
stroke="#eee"
|
||||
@@ -13,10 +15,10 @@
|
||||
/>
|
||||
@for (int bit : IntStream.range(0, 56).filter(bit -> (mask & (1L << bit)) != 0).toArray())
|
||||
<line
|
||||
x1="${18 + 15*Math.sin(2 * Math.PI * bit / 56.)}"
|
||||
y1="${18 - 15*Math.cos(2 * Math.PI * bit / 56.)}"
|
||||
x2="${18 + 17*Math.sin(2 * Math.PI * bit / 56.)}"
|
||||
y2="${18 - 17*Math.cos(2 * Math.PI * bit / 56.)}"
|
||||
x1="${20 + 15*Math.sin(2 * Math.PI * bit / 56.)}"
|
||||
y1="${20 - 15*Math.cos(2 * Math.PI * bit / 56.)}"
|
||||
x2="${20 + 17*Math.sin(2 * Math.PI * bit / 56.)}"
|
||||
y2="${20 - 17*Math.cos(2 * Math.PI * bit / 56.)}"
|
||||
stroke="#444"
|
||||
stroke-width="2"
|
||||
/>
|
||||
|
@@ -12,7 +12,7 @@
|
||||
<div class="flex flex-col grow" >
|
||||
<div class="flex flex-row space-x-2 place-items-center">
|
||||
<div class="flex-0" title="Match density">
|
||||
@template.serp.part.matchogram(mask = result.first.positionsMask)
|
||||
@template.serp.part.matchogram(mask = result.first.positionsMask, domain=result.getFirst().url.domain.toString())
|
||||
</div>
|
||||
<div class="flex grow justify-between items-start">
|
||||
<div class="flex-1">
|
||||
|
@@ -3,7 +3,7 @@
|
||||
|
||||
@param SearchFilters filters
|
||||
|
||||
<aside class="md:w-64 py-4 shrink-0 hidden sm:block">
|
||||
<aside class="md:w-64 py-4 shrink-0 hidden md:block finepointer:block">
|
||||
<div class="space-y-6 sticky top-4">
|
||||
<div class="bg-white dark:bg-gray-800 p-4 border dark:border-gray-600 border-gray-300">
|
||||
<h2 class="font-medium mb-3 flex items-center font-serif hidden md:block">
|
||||
@@ -13,7 +13,7 @@
|
||||
@for (List<SearchFilters.Filter> filterGroup : filters.getFilterGroups())
|
||||
@for (SearchFilters.Filter filter : filterGroup)
|
||||
<label class="flex items-center">
|
||||
<button title="${filter.displayName}" onclick="document.location='$unsafe{filter.url}'" class="flex-1 py-2 pl-2 rounded flex space-x-2 dark:has-[:checked]:bg-gray-950 has-[:checked]:bg-gray-100 has-[:checked]:text-slate-900 dark:has-[:checked]:text-slate-100 hover:bg-gray-50 dark:hover:bg-gray-950 bg-white dark:bg-gray-900 dark:border dark:border-gray-600 text-margeblue dark:text-slate-200 outline-1 active:outline">
|
||||
<button title="${filter.displayName}" onclick="document.location='$unsafe{filter.url}'" class="flex-1 py-2 pl-2 rounded flex space-x-2 dark:has-[:checked]:bg-gray-950 has-[:checked]:bg-gray-300 has-[:checked]:text-slate-900 dark:has-[:checked]:text-slate-100 hover:bg-gray-50 dark:hover:bg-gray-950 bg-white dark:bg-gray-900 dark:border dark:border-gray-600 text-margeblue dark:text-slate-200 outline-1 active:outline">
|
||||
@if (filter.current)
|
||||
<input type="checkbox" checked class="sr-only" aria-checked="true" />
|
||||
@else
|
||||
@@ -38,7 +38,7 @@
|
||||
<div class="space-y-2">
|
||||
@for (SearchFilters.SearchOption option : filters.searchOptions())
|
||||
<label class="flex items-center">
|
||||
<button title="${option.name()}" onclick="document.location='$unsafe{option.getUrl()}'" class="flex-1 py-2 pl-2 rounded flex space-x-2 dark:has-[:checked]:bg-gray-950 has-[:checked]:bg-gray-100 has-[:checked]:text-slate-900 dark:has-[:checked]:text-slate-100 hover:bg-gray-50 dark:hover:bg-gray-950 bg-white dark:bg-gray-900 dark:border dark:border-gray-600 text-margeblue dark:text-slate-200 outline-1 active:outline">
|
||||
<button title="${option.name()}" onclick="document.location='$unsafe{option.getUrl()}'" class="flex-1 py-2 pl-2 rounded flex space-x-2 dark:has-[:checked]:bg-gray-950 has-[:checked]:bg-gray-300 has-[:checked]:text-slate-900 dark:has-[:checked]:text-slate-100 hover:bg-gray-50 dark:hover:bg-gray-950 bg-white dark:bg-gray-900 dark:border dark:border-gray-600 text-margeblue dark:text-slate-200 outline-1 active:outline">
|
||||
@if (option.isSet())
|
||||
<input type="checkbox" checked class="sr-only" aria-checked="true" />
|
||||
@else
|
||||
|
@@ -15,7 +15,7 @@
|
||||
|
||||
@template.part.head(title = "Marginalia Search", allowIndexing = true)
|
||||
|
||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
||||
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||
|
||||
@template.part.navbar(navbar = navbar)
|
||||
|
||||
@@ -32,18 +32,14 @@
|
||||
|
||||
@if (model.news().isEmpty())
|
||||
<div class="max-w-7xl mx-auto flex flex-col space-y-4 fill-w">
|
||||
<div class="border dark:border-gray-600 dark:bg-gray-800 bg-white rounded p-2 m-4 ">
|
||||
<div class="border border-gray-300 border-gray-100 dark:border-gray-600 dark:bg-gray-800 bg-white rounded p-2 m-4 ">
|
||||
<div class="text-slate-700 dark:text-white text-sm p-4">
|
||||
<div class="fas fa-gift mr-1 text-margeblue dark:text-slate-200"></div>
|
||||
This is the new design and home of Marginalia Search.
|
||||
You can read about what this entails <a href="https://about.marginalia-search.com/article/redesign/" class="underline text-liteblue dark:text-blue-200">here</a>.
|
||||
<p class="my-4"></p>
|
||||
The old version of Marginalia Search remains available at
|
||||
<a href="https://old-search.marginalia.nu/" class="underline text-liteblue dark:text-blue-200">https://old-search.marginalia.nu/</a>.
|
||||
The old version of Marginalia Search remains available
|
||||
<a href="https://old-search.marginalia.nu/" class="underline text-liteblue dark:text-blue-200">here</a>.
|
||||
</div>
|
||||
</div>
|
||||
<div class="mx-auto flex flex-col sm:flex-row my-4 sm:space-x-2 space-y-2 sm:space-y-0 w-full md:w-auto px-2">
|
||||
<div class="flex flex-col border dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3">
|
||||
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3">
|
||||
<div><i class="fas fa-sailboat mx-2 text-margeblue dark:text-slate-200"></i>Explore the Web</div>
|
||||
<ul class="list-disc ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
||||
<li>Prioritizes non-commercial content</li>
|
||||
@@ -52,7 +48,7 @@
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="flex flex-col border dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3 ">
|
||||
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3 ">
|
||||
<div><i class="fas fa-hand-holding-hand mx-2 text-margeblue dark:text-slate-200"></i>Open Source</div>
|
||||
<ul class="list-disc ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
||||
<li>Custom index and crawler software</li>
|
||||
@@ -65,7 +61,7 @@
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="flex flex-col border dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3 ">
|
||||
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3 ">
|
||||
<div><i class="fas fa-lock mx-2 text-margeblue dark:text-slate-200"></i> Privacy by default</div>
|
||||
<ul class="list-disc ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
||||
<li>Filter out tracking and adtech</li>
|
||||
|
@@ -13,7 +13,7 @@
|
||||
|
||||
@template.part.head(title = "Marginalia Search - " + parameters.query())
|
||||
|
||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
||||
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||
@template.part.navbar(navbar = navbar)
|
||||
|
||||
<div>
|
||||
|
@@ -11,7 +11,7 @@
|
||||
|
||||
@template.part.head(title = "Marginalia Search - " + model.domainA() + "/" + model.domainB())
|
||||
|
||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
||||
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||
|
||||
@template.part.navbar(navbar = navbar)
|
||||
|
||||
|
@@ -9,7 +9,7 @@
|
||||
|
||||
@template.part.head(title = "Marginalia Search - " + model.domain())
|
||||
|
||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
||||
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||
|
||||
@template.part.navbar(navbar = navbar)
|
||||
|
||||
|
@@ -7,7 +7,7 @@
|
||||
|
||||
@if (!list.isEmpty())
|
||||
|
||||
<div class="bg-white dark:bg-gray-800 shadow-sm rounded overflow-hidden border dark:border-gray-600">
|
||||
<div class="bg-white dark:bg-gray-800 shadow-sm rounded overflow-hidden border border-gray-300 dark:border-gray-600">
|
||||
<div class="px-4 py-2 bg-margeblue text-white border-b border-gray-200 dark:border-gray-600 flex place-items-baseline">
|
||||
<h2 class="text-md">${title}</h2>
|
||||
<div class="grow"></div>
|
||||
|
@@ -9,11 +9,11 @@
|
||||
|
||||
@template.part.head(title = "Marginalia Search - Site Viewer")
|
||||
|
||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
||||
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||
|
||||
@template.part.navbar(navbar = navbar)
|
||||
|
||||
<header class="border-b border-gray-300 dark:border-gray-600 bg-white dark:bg-gray-800 shadow-md">
|
||||
<header class="border-b border-gray-300 border-gray-300 dark:border-gray-600 bg-white dark:bg-gray-800 shadow-md">
|
||||
<div class="max-w-[1400px] mx-auto px-4 py-4">
|
||||
<h1 class="text-base md:text-xl mr-2 md:mr-8 font-serif">View Site Information</h1>
|
||||
</div>
|
||||
@@ -22,7 +22,7 @@
|
||||
<div class="max-w-[1000px] mx-auto flex gap-4 flex-col md:flex-row place-items-center md:place-items-start p-4">
|
||||
|
||||
|
||||
<div class="border dark:border-gray-600 rounded md:my-4 overflow-hidden bg-white dark:bg-gray-800 flex flex-col space-y-2 flex-1">
|
||||
<div class="border border-gray-300 dark:border-gray-600 rounded md:my-4 overflow-hidden bg-white dark:bg-gray-800 flex flex-col space-y-2 flex-1">
|
||||
<div class="bg-margeblue text-white p-2 text-sm mb-2">View Site Information</div>
|
||||
|
||||
<p class="mx-4">This utility lets you explore what the search engine knows about the web,
|
||||
@@ -45,7 +45,7 @@
|
||||
</div>
|
||||
|
||||
@if (!model.domains().isEmpty())
|
||||
<div class="border dark:border-gray-600 rounded md:my-4 overflow-hidden w-full md:w-auto">
|
||||
<div class="border border-gray-300 dark:border-gray-600 rounded md:my-4 overflow-hidden w-full md:w-auto">
|
||||
<div class="bg-margeblue text-white p-2 text-sm">Recently Discovered Domains</div>
|
||||
|
||||
|
||||
|
@@ -8,17 +8,17 @@
|
||||
<div class="flex flex-col space-y-4 my-4 w-full">
|
||||
|
||||
@if (backlinks.results().isEmpty())
|
||||
<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm ">
|
||||
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm ">
|
||||
The search engine isn't aware of any backlinks to ${backlinks.domain()}!
|
||||
</div>
|
||||
@else
|
||||
<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
|
||||
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
|
||||
Showing documents linking to ${backlinks.domain()}
|
||||
</div>
|
||||
@endif
|
||||
|
||||
@for (GroupedUrlDetails group : backlinks.results())
|
||||
<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden mx-4">
|
||||
<div class="border dark:border-gray-600 border-gray-300 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden mx-4">
|
||||
<div class="flex space-x-2 flex-row place-items-baseline bg-margeblue text-white p-2 text-md">
|
||||
<span class="fas fa-globe"></span>
|
||||
<a href="/site/${group.domain().toString()}">${group.domain().toString()}</a>
|
||||
|
@@ -9,17 +9,17 @@
|
||||
<div class="flex flex-col space-y-4 my-4">
|
||||
|
||||
@if (docs.results().isEmpty())
|
||||
<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
|
||||
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
|
||||
The search engine doesn't index any documents from ${docs.domain()}
|
||||
</div>
|
||||
@else
|
||||
<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
|
||||
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
|
||||
Showing documents from ${docs.domain()}
|
||||
</div>
|
||||
@endif
|
||||
|
||||
@for (UrlDetails details : docs.results())
|
||||
<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden mx-4">
|
||||
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden mx-4">
|
||||
<div class="flex grow justify-between items-start p-4">
|
||||
<div class="flex-1">
|
||||
<h2 class="text-xl text-gray-800 dark:text-white font-serif mr-4">
|
||||
|
@@ -8,9 +8,9 @@
|
||||
<!-- Main content -->
|
||||
|
||||
<div class="flex-1 p-4 space-y-4 mx-auto w-full md:w-auto">
|
||||
<div class="flex border dark:border-gray-600 rounded bg-white dark:bg-gray-800 flex-col space-y-4 pb-4 overflow-hidden md:max-w-lg" >
|
||||
<div class="flex place-items-baseline space-x-2 p-2 text-md border-b dark:border-gray-600 bg-margeblue text-white">
|
||||
<i class="fa fa-globe"></i>
|
||||
<div class="flex border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 flex-col space-y-4 pb-4 overflow-hidden md:max-w-lg" >
|
||||
<div class="flex place-items-center space-x-2 p-2 text-md border-b dark:border-gray-600 bg-margeblue text-white">
|
||||
<img src="/site/${siteInfo.domain()}/favicon" style="width: 16px; height: 16px; vertical-align: center">
|
||||
<span>${siteInfo.domain()}</span>
|
||||
<div class="grow">
|
||||
</div>
|
||||
|
@@ -4,7 +4,7 @@
|
||||
@param ReportDomain reportDomain
|
||||
|
||||
<div class="flex-col mx-auto">
|
||||
<div class="max-w-2xl mx-auto bg-white dark:bg-gray-800 border dark:border-gray-600 rounded overflow-auto shadow-sm my-4 space-y-4 w-full">
|
||||
<div class="max-w-2xl mx-auto bg-white dark:bg-gray-800 border border-gray-300 dark:border-gray-600 rounded overflow-auto shadow-sm my-4 space-y-4 w-full">
|
||||
<div class="px-4 py-2 bg-margeblue text-white border-b border-gray-200 dark:border-gray-800">
|
||||
<h2 class="text-md">Report Domain Issue</h2>
|
||||
</div>
|
||||
|
@@ -9,6 +9,15 @@ module.exports = {
|
||||
nicotine: '#f8f8ee',
|
||||
margeblue: '#3e5f6f',
|
||||
liteblue: '#0066cc',
|
||||
bgblue: '#e5e9eb',
|
||||
},
|
||||
screens: {
|
||||
'coarsepointer': {
|
||||
'raw': '(pointer: coarse)'
|
||||
},
|
||||
'finepointer': {
|
||||
'raw': '(pointer: fine)'
|
||||
},
|
||||
}
|
||||
},
|
||||
screens: {
|
||||
|
@@ -42,6 +42,8 @@ dependencies {
|
||||
implementation project(':code:libraries:message-queue')
|
||||
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
implementation project(':code:functions:favicon')
|
||||
implementation project(':code:functions:favicon:api')
|
||||
|
||||
implementation project(':code:processes:crawling-process:model')
|
||||
implementation project(':code:processes:crawling-process:model')
|
||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.executor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.execution.*;
|
||||
import nu.marginalia.functions.favicon.FaviconGrpcService;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
import nu.marginalia.service.server.SparkService;
|
||||
@@ -24,6 +25,7 @@ public class ExecutorSvc extends SparkService {
|
||||
ExecutorCrawlGrpcService executorCrawlGrpcService,
|
||||
ExecutorSideloadGrpcService executorSideloadGrpcService,
|
||||
ExecutorExportGrpcService executorExportGrpcService,
|
||||
FaviconGrpcService faviconGrpcService,
|
||||
ExecutionInit executionInit,
|
||||
ExecutorFileTransferService fileTransferService) throws Exception {
|
||||
super(params,
|
||||
@@ -31,7 +33,8 @@ public class ExecutorSvc extends SparkService {
|
||||
List.of(executorGrpcService,
|
||||
executorCrawlGrpcService,
|
||||
executorSideloadGrpcService,
|
||||
executorExportGrpcService)
|
||||
executorExportGrpcService,
|
||||
faviconGrpcService)
|
||||
);
|
||||
|
||||
this.executionInit = executionInit;
|
||||
|
@@ -16,7 +16,8 @@ include 'code:services-application:status-service'
|
||||
|
||||
include 'code:functions:math'
|
||||
include 'code:functions:math:api'
|
||||
|
||||
include 'code:functions:favicon'
|
||||
include 'code:functions:favicon:api'
|
||||
include 'code:functions:domain-info'
|
||||
include 'code:functions:domain-info:api'
|
||||
|
||||
|
Reference in New Issue
Block a user