mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
14 Commits
deploy-010
...
deploy-011
Author | SHA1 | Date | |
---|---|---|---|
|
ecb0e57a1a | ||
|
8c61f61b46 | ||
|
662a18c933 | ||
|
1c2426a052 | ||
|
34df7441ac | ||
|
5387e2bd80 | ||
|
0f3b24d0f8 | ||
|
a732095d2a | ||
|
6607f0112f | ||
|
4913730de9 | ||
|
1db64f9d56 | ||
|
4dcff14498 | ||
|
426658f64e | ||
|
2181b22f05 |
@@ -43,12 +43,11 @@ subprojects.forEach {it ->
|
|||||||
}
|
}
|
||||||
|
|
||||||
ext {
|
ext {
|
||||||
jvmVersion=23
|
jvmVersion = 24
|
||||||
dockerImageBase='container-registry.oracle.com/graalvm/jdk:23'
|
dockerImageBase='container-registry.oracle.com/graalvm/jdk:24'
|
||||||
dockerImageTag='latest'
|
dockerImageTag='latest'
|
||||||
dockerImageRegistry='marginalia'
|
dockerImageRegistry='marginalia'
|
||||||
jibVersion = '3.4.4'
|
jibVersion = '3.4.4'
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
idea {
|
idea {
|
||||||
|
@@ -14,7 +14,7 @@ public class EdgeDomain implements Serializable {
|
|||||||
@Nonnull
|
@Nonnull
|
||||||
public final String topDomain;
|
public final String topDomain;
|
||||||
|
|
||||||
public EdgeDomain(String host) {
|
public EdgeDomain(@Nonnull String host) {
|
||||||
Objects.requireNonNull(host, "domain name must not be null");
|
Objects.requireNonNull(host, "domain name must not be null");
|
||||||
|
|
||||||
host = host.toLowerCase();
|
host = host.toLowerCase();
|
||||||
@@ -61,6 +61,10 @@ public class EdgeDomain implements Serializable {
|
|||||||
this.topDomain = topDomain;
|
this.topDomain = topDomain;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static String getTopDomain(String host) {
|
||||||
|
return new EdgeDomain(host).topDomain;
|
||||||
|
}
|
||||||
|
|
||||||
private boolean looksLikeGovTld(String host) {
|
private boolean looksLikeGovTld(String host) {
|
||||||
if (host.length() < 8)
|
if (host.length() < 8)
|
||||||
return false;
|
return false;
|
||||||
@@ -116,24 +120,6 @@ public class EdgeDomain implements Serializable {
|
|||||||
return topDomain.substring(0, cutPoint).toLowerCase();
|
return topDomain.substring(0, cutPoint).toLowerCase();
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getLongDomainKey() {
|
|
||||||
StringBuilder ret = new StringBuilder();
|
|
||||||
|
|
||||||
int cutPoint = topDomain.indexOf('.');
|
|
||||||
if (cutPoint < 0) {
|
|
||||||
ret.append(topDomain);
|
|
||||||
} else {
|
|
||||||
ret.append(topDomain, 0, cutPoint);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!subDomain.isEmpty() && !"www".equals(subDomain)) {
|
|
||||||
ret.append(":");
|
|
||||||
ret.append(subDomain);
|
|
||||||
}
|
|
||||||
|
|
||||||
return ret.toString().toLowerCase();
|
|
||||||
}
|
|
||||||
|
|
||||||
/** If possible, try to provide an alias domain,
|
/** If possible, try to provide an alias domain,
|
||||||
* i.e. a domain name that is very likely to link to this one
|
* i.e. a domain name that is very likely to link to this one
|
||||||
* */
|
* */
|
||||||
|
@@ -25,7 +25,7 @@ import static org.mockito.Mockito.when;
|
|||||||
class ZkServiceRegistryTest {
|
class ZkServiceRegistryTest {
|
||||||
private static final int ZOOKEEPER_PORT = 2181;
|
private static final int ZOOKEEPER_PORT = 2181;
|
||||||
private static final GenericContainer<?> zookeeper =
|
private static final GenericContainer<?> zookeeper =
|
||||||
new GenericContainer<>("zookeeper:3.8.0")
|
new GenericContainer<>("zookeeper:3.8")
|
||||||
.withExposedPorts(ZOOKEEPER_PORT);
|
.withExposedPorts(ZOOKEEPER_PORT);
|
||||||
|
|
||||||
List<ZkServiceRegistry> registries = new ArrayList<>();
|
List<ZkServiceRegistry> registries = new ArrayList<>();
|
||||||
|
@@ -23,16 +23,33 @@ public class SimpleBlockingThreadPool {
|
|||||||
private final Logger logger = LoggerFactory.getLogger(SimpleBlockingThreadPool.class);
|
private final Logger logger = LoggerFactory.getLogger(SimpleBlockingThreadPool.class);
|
||||||
|
|
||||||
public SimpleBlockingThreadPool(String name, int poolSize, int queueSize) {
|
public SimpleBlockingThreadPool(String name, int poolSize, int queueSize) {
|
||||||
|
this(name, poolSize, queueSize, ThreadType.PLATFORM);
|
||||||
|
}
|
||||||
|
|
||||||
|
public SimpleBlockingThreadPool(String name, int poolSize, int queueSize, ThreadType threadType) {
|
||||||
tasks = new ArrayBlockingQueue<>(queueSize);
|
tasks = new ArrayBlockingQueue<>(queueSize);
|
||||||
|
|
||||||
for (int i = 0; i < poolSize; i++) {
|
for (int i = 0; i < poolSize; i++) {
|
||||||
Thread worker = new Thread(this::worker, name + "[" + i + "]");
|
|
||||||
worker.setDaemon(true);
|
Thread.Builder threadBuilder = switch (threadType) {
|
||||||
worker.start();
|
case VIRTUAL -> Thread.ofVirtual();
|
||||||
|
case PLATFORM -> Thread.ofPlatform().daemon(true);
|
||||||
|
};
|
||||||
|
|
||||||
|
Thread worker = threadBuilder
|
||||||
|
.name(name + "[" + i + "]")
|
||||||
|
.start(this::worker);
|
||||||
|
|
||||||
workers.add(worker);
|
workers.add(worker);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public enum ThreadType {
|
||||||
|
VIRTUAL,
|
||||||
|
PLATFORM
|
||||||
|
}
|
||||||
|
|
||||||
public void submit(Task task) throws InterruptedException {
|
public void submit(Task task) throws InterruptedException {
|
||||||
tasks.put(task);
|
tasks.put(task);
|
||||||
}
|
}
|
||||||
|
@@ -41,10 +41,7 @@ import java.nio.file.Files;
|
|||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.nio.file.StandardCopyOption;
|
import java.nio.file.StandardCopyOption;
|
||||||
import java.security.Security;
|
import java.security.Security;
|
||||||
import java.util.ArrayList;
|
import java.util.*;
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
@@ -106,9 +103,18 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
this.blacklist = blacklist;
|
this.blacklist = blacklist;
|
||||||
this.node = processConfiguration.node();
|
this.node = processConfiguration.node();
|
||||||
|
|
||||||
|
SimpleBlockingThreadPool.ThreadType threadType;
|
||||||
|
if (Boolean.getBoolean("crawler.useVirtualThreads")) {
|
||||||
|
threadType = SimpleBlockingThreadPool.ThreadType.VIRTUAL;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
threadType = SimpleBlockingThreadPool.ThreadType.PLATFORM;
|
||||||
|
}
|
||||||
|
|
||||||
pool = new SimpleBlockingThreadPool("CrawlerPool",
|
pool = new SimpleBlockingThreadPool("CrawlerPool",
|
||||||
Integer.getInteger("crawler.poolSize", 256),
|
Integer.getInteger("crawler.poolSize", 256),
|
||||||
1);
|
1,
|
||||||
|
threadType);
|
||||||
|
|
||||||
|
|
||||||
// Wait for the blacklist to be loaded before starting the crawl
|
// Wait for the blacklist to be loaded before starting the crawl
|
||||||
@@ -224,10 +230,7 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
|
|
||||||
logger.info("Loaded {} domains", crawlSpecRecords.size());
|
logger.info("Loaded {} domains", crawlSpecRecords.size());
|
||||||
|
|
||||||
// Shuffle the domains to ensure we get a good mix of domains in each crawl,
|
crawlSpecRecords.sort(crawlSpecArrangement(crawlSpecRecords));
|
||||||
// so that e.g. the big domains don't get all crawled at once, or we end up
|
|
||||||
// crawling the same server in parallel from different subdomains...
|
|
||||||
Collections.shuffle(crawlSpecRecords);
|
|
||||||
|
|
||||||
// First a validation run to ensure the file is all good to parse
|
// First a validation run to ensure the file is all good to parse
|
||||||
if (crawlSpecRecords.isEmpty()) {
|
if (crawlSpecRecords.isEmpty()) {
|
||||||
@@ -306,6 +309,30 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Create a comparator that sorts the crawl specs in a way that is beneficial for the crawl,
|
||||||
|
* we want to enqueue domains that have common top domains first, but otherwise have a random
|
||||||
|
* order.
|
||||||
|
* <p></p>
|
||||||
|
* Note, we can't use hash codes for randomization as it is not desirable to have the same order
|
||||||
|
* every time the process is restarted (and CrawlSpecRecord is a record, which defines equals and
|
||||||
|
* hashcode based on the fields).
|
||||||
|
* */
|
||||||
|
private Comparator<CrawlSpecRecord> crawlSpecArrangement(List<CrawlSpecRecord> records) {
|
||||||
|
Random r = new Random();
|
||||||
|
Map<String, Integer> topDomainCounts = new HashMap<>(4 + (int) Math.sqrt(records.size()));
|
||||||
|
Map<String, Integer> randomOrder = new HashMap<>(records.size());
|
||||||
|
|
||||||
|
for (var spec : records) {
|
||||||
|
topDomainCounts.merge(EdgeDomain.getTopDomain(spec.domain), 1, Integer::sum);
|
||||||
|
randomOrder.put(spec.domain, r.nextInt());
|
||||||
|
}
|
||||||
|
|
||||||
|
return Comparator.comparing((CrawlSpecRecord spec) -> topDomainCounts.getOrDefault(EdgeDomain.getTopDomain(spec.domain), 0) >= 8)
|
||||||
|
.reversed()
|
||||||
|
.thenComparing(spec -> randomOrder.get(spec.domain))
|
||||||
|
.thenComparing(Record::hashCode); // non-deterministic tie-breaker to
|
||||||
|
}
|
||||||
|
|
||||||
/** Submit a task for execution if it can be run, returns true if it was submitted
|
/** Submit a task for execution if it can be run, returns true if it was submitted
|
||||||
* or if it can be discarded */
|
* or if it can be discarded */
|
||||||
private boolean trySubmitDeferredTask(CrawlTask task) {
|
private boolean trySubmitDeferredTask(CrawlTask task) {
|
||||||
|
@@ -11,6 +11,7 @@ import java.nio.file.Path;
|
|||||||
import java.sql.Connection;
|
import java.sql.Connection;
|
||||||
import java.sql.DriverManager;
|
import java.sql.DriverManager;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
|
import java.time.Duration;
|
||||||
import java.time.Instant;
|
import java.time.Instant;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
@@ -24,6 +25,17 @@ public class DomainStateDb implements AutoCloseable {
|
|||||||
|
|
||||||
private final Connection connection;
|
private final Connection connection;
|
||||||
|
|
||||||
|
|
||||||
|
public record CrawlMeta(
|
||||||
|
String domainName,
|
||||||
|
Instant lastFullCrawl,
|
||||||
|
Duration recrawlTime,
|
||||||
|
Duration crawlTime,
|
||||||
|
int recrawlErrors,
|
||||||
|
int crawlChanges,
|
||||||
|
int totalCrawlSize
|
||||||
|
) {}
|
||||||
|
|
||||||
public record SummaryRecord(
|
public record SummaryRecord(
|
||||||
String domainName,
|
String domainName,
|
||||||
Instant lastUpdated,
|
Instant lastUpdated,
|
||||||
@@ -102,6 +114,17 @@ public class DomainStateDb implements AutoCloseable {
|
|||||||
feedUrl TEXT
|
feedUrl TEXT
|
||||||
)
|
)
|
||||||
""");
|
""");
|
||||||
|
stmt.executeUpdate("""
|
||||||
|
CREATE TABLE IF NOT EXISTS crawl_meta (
|
||||||
|
domain TEXT PRIMARY KEY,
|
||||||
|
lastFullCrawlEpochMs LONG NOT NULL,
|
||||||
|
recrawlTimeMs LONG NOT NULL,
|
||||||
|
recrawlErrors INTEGER NOT NULL,
|
||||||
|
crawlTimeMs LONG NOT NULL,
|
||||||
|
crawlChanges INTEGER NOT NULL,
|
||||||
|
totalCrawlSize INTEGER NOT NULL
|
||||||
|
)
|
||||||
|
""");
|
||||||
stmt.executeUpdate("""
|
stmt.executeUpdate("""
|
||||||
CREATE TABLE IF NOT EXISTS favicon (
|
CREATE TABLE IF NOT EXISTS favicon (
|
||||||
domain TEXT PRIMARY KEY,
|
domain TEXT PRIMARY KEY,
|
||||||
@@ -164,6 +187,26 @@ public class DomainStateDb implements AutoCloseable {
|
|||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void save(CrawlMeta crawlMeta) {
|
||||||
|
if (connection == null) throw new IllegalStateException("No connection to domainstate db");
|
||||||
|
|
||||||
|
try (var stmt = connection.prepareStatement("""
|
||||||
|
INSERT OR REPLACE INTO crawl_meta (domain, lastFullCrawlEpochMs, recrawlTimeMs, recrawlErrors, crawlTimeMs, crawlChanges, totalCrawlSize)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||||
|
""")) {
|
||||||
|
stmt.setString(1, crawlMeta.domainName());
|
||||||
|
stmt.setLong(2, crawlMeta.lastFullCrawl.toEpochMilli());
|
||||||
|
stmt.setLong(3, crawlMeta.recrawlTime.toMillis());
|
||||||
|
stmt.setInt(4, crawlMeta.recrawlErrors);
|
||||||
|
stmt.setLong(5, crawlMeta.crawlTime.toMillis());
|
||||||
|
stmt.setInt(6, crawlMeta.crawlChanges);
|
||||||
|
stmt.setInt(7, crawlMeta.totalCrawlSize);
|
||||||
|
stmt.executeUpdate();
|
||||||
|
} catch (SQLException e) {
|
||||||
|
logger.error("Failed to insert crawl meta record", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public void save(SummaryRecord record) {
|
public void save(SummaryRecord record) {
|
||||||
if (connection == null) throw new IllegalStateException("No connection to domainstate db");
|
if (connection == null) throw new IllegalStateException("No connection to domainstate db");
|
||||||
|
|
||||||
@@ -182,7 +225,35 @@ public class DomainStateDb implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public Optional<SummaryRecord> get(String domainName) {
|
public Optional<CrawlMeta> getMeta(String domainName) {
|
||||||
|
if (connection == null)
|
||||||
|
return Optional.empty();
|
||||||
|
|
||||||
|
try (var stmt = connection.prepareStatement("""
|
||||||
|
SELECT domain, lastFullCrawlEpochMs, recrawlTimeMs, recrawlErrors, crawlTimeMs, crawlChanges, totalCrawlSize
|
||||||
|
FROM crawl_meta
|
||||||
|
WHERE domain = ?
|
||||||
|
""")) {
|
||||||
|
stmt.setString(1, domainName);
|
||||||
|
var rs = stmt.executeQuery();
|
||||||
|
if (rs.next()) {
|
||||||
|
return Optional.of(new CrawlMeta(
|
||||||
|
rs.getString("domain"),
|
||||||
|
Instant.ofEpochMilli(rs.getLong("lastFullCrawlEpochMs")),
|
||||||
|
Duration.ofMillis(rs.getLong("recrawlTimeMs")),
|
||||||
|
Duration.ofMillis(rs.getLong("crawlTimeMs")),
|
||||||
|
rs.getInt("recrawlErrors"),
|
||||||
|
rs.getInt("crawlChanges"),
|
||||||
|
rs.getInt("totalCrawlSize")
|
||||||
|
));
|
||||||
|
}
|
||||||
|
} catch (SQLException ex) {
|
||||||
|
logger.error("Failed to get crawl meta record", ex);
|
||||||
|
}
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
public Optional<SummaryRecord> getSummary(String domainName) {
|
||||||
if (connection == null)
|
if (connection == null)
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
|
|
||||||
|
@@ -29,6 +29,7 @@ import java.net.http.HttpResponse;
|
|||||||
import java.net.http.HttpTimeoutException;
|
import java.net.http.HttpTimeoutException;
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
import java.util.concurrent.Executors;
|
import java.util.concurrent.Executors;
|
||||||
import java.util.concurrent.Semaphore;
|
import java.util.concurrent.Semaphore;
|
||||||
import java.util.zip.GZIPInputStream;
|
import java.util.zip.GZIPInputStream;
|
||||||
@@ -56,12 +57,21 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
private final HttpClient client;
|
private final HttpClient client;
|
||||||
|
|
||||||
private HttpClient createClient() {
|
private HttpClient createClient() {
|
||||||
|
final ExecutorService executorService;
|
||||||
|
|
||||||
|
if (Boolean.getBoolean("crawler.httpclient.useVirtualThreads")) {
|
||||||
|
executorService = Executors.newVirtualThreadPerTaskExecutor();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
executorService = Executors.newCachedThreadPool();
|
||||||
|
}
|
||||||
|
|
||||||
return HttpClient.newBuilder()
|
return HttpClient.newBuilder()
|
||||||
.sslContext(NoSecuritySSL.buildSslContext())
|
.sslContext(NoSecuritySSL.buildSslContext())
|
||||||
.cookieHandler(cookies)
|
.cookieHandler(cookies)
|
||||||
.followRedirects(HttpClient.Redirect.NORMAL)
|
.followRedirects(HttpClient.Redirect.NORMAL)
|
||||||
.connectTimeout(Duration.ofSeconds(8))
|
.connectTimeout(Duration.ofSeconds(8))
|
||||||
.executor(Executors.newCachedThreadPool())
|
.executor(executorService)
|
||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -444,7 +454,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
|
|
||||||
class SendLock implements AutoCloseable {
|
class SendLock implements AutoCloseable {
|
||||||
|
|
||||||
private static final Semaphore maxConcurrentRequests = new Semaphore(Integer.getInteger("crawler.maxConcurrentRequests", 100));
|
private static final Semaphore maxConcurrentRequests = new Semaphore(Integer.getInteger("crawler.maxConcurrentRequests", 512));
|
||||||
boolean closed = false;
|
boolean closed = false;
|
||||||
|
|
||||||
public SendLock() {
|
public SendLock() {
|
||||||
|
@@ -3,6 +3,7 @@ package nu.marginalia.crawl.retreival;
|
|||||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||||
|
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
|
import java.util.concurrent.ThreadLocalRandom;
|
||||||
|
|
||||||
import static java.lang.Math.max;
|
import static java.lang.Math.max;
|
||||||
import static java.lang.Math.min;
|
import static java.lang.Math.min;
|
||||||
@@ -53,12 +54,13 @@ public class CrawlDelayTimer {
|
|||||||
public void waitFetchDelay(long spentTime) {
|
public void waitFetchDelay(long spentTime) {
|
||||||
long sleepTime = delayTime;
|
long sleepTime = delayTime;
|
||||||
|
|
||||||
|
long jitter = ThreadLocalRandom.current().nextLong(0, 150);
|
||||||
try {
|
try {
|
||||||
if (sleepTime >= 1) {
|
if (sleepTime >= 1) {
|
||||||
if (spentTime > sleepTime)
|
if (spentTime > sleepTime)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
Thread.sleep(min(sleepTime - spentTime, 5000));
|
Thread.sleep(min(sleepTime - spentTime, 5000) + jitter);
|
||||||
} else {
|
} else {
|
||||||
// When no crawl delay is specified, lean toward twice the fetch+process time,
|
// When no crawl delay is specified, lean toward twice the fetch+process time,
|
||||||
// within sane limits. This means slower servers get slower crawling, and faster
|
// within sane limits. This means slower servers get slower crawling, and faster
|
||||||
@@ -71,17 +73,17 @@ public class CrawlDelayTimer {
|
|||||||
if (spentTime > sleepTime)
|
if (spentTime > sleepTime)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
Thread.sleep(sleepTime - spentTime);
|
Thread.sleep(sleepTime - spentTime + jitter);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (slowDown) {
|
if (slowDown) {
|
||||||
// Additional delay when the server is signalling it wants slower requests
|
// Additional delay when the server is signalling it wants slower requests
|
||||||
Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS);
|
Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS + jitter);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (InterruptedException e) {
|
catch (InterruptedException e) {
|
||||||
Thread.currentThread().interrupt();
|
Thread.currentThread().interrupt();
|
||||||
throw new RuntimeException();
|
throw new RuntimeException("Interrupted", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -26,6 +26,8 @@ import java.io.IOException;
|
|||||||
import java.net.InetAddress;
|
import java.net.InetAddress;
|
||||||
import java.net.UnknownHostException;
|
import java.net.UnknownHostException;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.time.Instant;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
@@ -108,15 +110,24 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, delayTimer);
|
DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, delayTimer);
|
||||||
domainStateDb.save(summaryRecord);
|
domainStateDb.save(summaryRecord);
|
||||||
|
|
||||||
|
if (Thread.interrupted()) {
|
||||||
|
// There's a small chance we're interrupted during the sniffing portion
|
||||||
|
throw new InterruptedException();
|
||||||
|
}
|
||||||
|
|
||||||
|
Instant recrawlStart = Instant.now();
|
||||||
|
CrawlerRevisitor.RecrawlMetadata recrawlMetadata = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
|
||||||
|
Duration recrawlTime = Duration.between(recrawlStart, Instant.now());
|
||||||
|
|
||||||
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
|
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
|
||||||
if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
|
if (recrawlMetadata.size() > 0) {
|
||||||
// If we have reference data, we will always grow the crawl depth a bit
|
// If we have reference data, we will always grow the crawl depth a bit
|
||||||
crawlFrontier.increaseDepth(1.5, 2500);
|
crawlFrontier.increaseDepth(1.5, 2500);
|
||||||
}
|
}
|
||||||
|
|
||||||
oldCrawlData.close(); // proactively close the crawl data reference here to not hold onto expensive resources
|
oldCrawlData.close(); // proactively close the crawl data reference here to not hold onto expensive resources
|
||||||
|
|
||||||
yield crawlDomain(probedUrl, robotsRules, delayTimer, domainLinks);
|
yield crawlDomain(probedUrl, robotsRules, delayTimer, domainLinks, recrawlMetadata, recrawlTime);
|
||||||
}
|
}
|
||||||
case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> {
|
case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> {
|
||||||
domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString()));
|
domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString()));
|
||||||
@@ -138,8 +149,11 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
private int crawlDomain(EdgeUrl rootUrl,
|
private int crawlDomain(EdgeUrl rootUrl,
|
||||||
SimpleRobotRules robotsRules,
|
SimpleRobotRules robotsRules,
|
||||||
CrawlDelayTimer delayTimer,
|
CrawlDelayTimer delayTimer,
|
||||||
DomainLinks domainLinks) {
|
DomainLinks domainLinks,
|
||||||
|
CrawlerRevisitor.RecrawlMetadata recrawlMetadata,
|
||||||
|
Duration recrawlTime) {
|
||||||
|
|
||||||
|
Instant crawlStart = Instant.now();
|
||||||
|
|
||||||
// Add external links to the crawl frontier
|
// Add external links to the crawl frontier
|
||||||
crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));
|
crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));
|
||||||
@@ -149,6 +163,8 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
|
crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int crawlerAdditions = 0;
|
||||||
|
|
||||||
while (!crawlFrontier.isEmpty()
|
while (!crawlFrontier.isEmpty()
|
||||||
&& !crawlFrontier.isCrawlDepthReached()
|
&& !crawlFrontier.isCrawlDepthReached()
|
||||||
&& errorCount < MAX_ERRORS
|
&& errorCount < MAX_ERRORS
|
||||||
@@ -180,7 +196,11 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
continue;
|
continue;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
fetchContentWithReference(top, delayTimer, DocumentWithReference.empty());
|
var result = fetchContentWithReference(top, delayTimer, DocumentWithReference.empty());
|
||||||
|
|
||||||
|
if (result.isOk()) {
|
||||||
|
crawlerAdditions++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
catch (InterruptedException ex) {
|
catch (InterruptedException ex) {
|
||||||
Thread.currentThread().interrupt();
|
Thread.currentThread().interrupt();
|
||||||
@@ -188,6 +208,17 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Duration crawlTime = Duration.between(crawlStart, Instant.now());
|
||||||
|
domainStateDb.save(new DomainStateDb.CrawlMeta(
|
||||||
|
domain,
|
||||||
|
Instant.now(),
|
||||||
|
recrawlTime,
|
||||||
|
crawlTime,
|
||||||
|
recrawlMetadata.errors(),
|
||||||
|
crawlerAdditions,
|
||||||
|
recrawlMetadata.size() + crawlerAdditions
|
||||||
|
));
|
||||||
|
|
||||||
return crawlFrontier.visitedSize();
|
return crawlFrontier.visitedSize();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -289,6 +320,10 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
logger.error("Error configuring link filter", ex);
|
logger.error("Error configuring link filter", ex);
|
||||||
|
if (Thread.interrupted()) {
|
||||||
|
Thread.currentThread().interrupt();
|
||||||
|
return DomainStateDb.SummaryRecord.forError(domain, "Crawler Interrupted", ex.getMessage());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
finally {
|
finally {
|
||||||
crawlFrontier.addVisited(rootUrl);
|
crawlFrontier.addVisited(rootUrl);
|
||||||
@@ -316,7 +351,7 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
);
|
);
|
||||||
|
|
||||||
private Optional<String> guessFeedUrl(CrawlDelayTimer timer) throws InterruptedException {
|
private Optional<String> guessFeedUrl(CrawlDelayTimer timer) throws InterruptedException {
|
||||||
var oldDomainStateRecord = domainStateDb.get(domain);
|
var oldDomainStateRecord = domainStateDb.getSummary(domain);
|
||||||
|
|
||||||
// If we are already aware of an old feed URL, then we can just revalidate it
|
// If we are already aware of an old feed URL, then we can just revalidate it
|
||||||
if (oldDomainStateRecord.isPresent()) {
|
if (oldDomainStateRecord.isPresent()) {
|
||||||
|
@@ -31,7 +31,7 @@ public class CrawlerRevisitor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** Performs a re-crawl of old documents, comparing etags and last-modified */
|
/** Performs a re-crawl of old documents, comparing etags and last-modified */
|
||||||
public int recrawl(CrawlDataReference oldCrawlData,
|
public RecrawlMetadata recrawl(CrawlDataReference oldCrawlData,
|
||||||
SimpleRobotRules robotsRules,
|
SimpleRobotRules robotsRules,
|
||||||
CrawlDelayTimer delayTimer)
|
CrawlDelayTimer delayTimer)
|
||||||
throws InterruptedException {
|
throws InterruptedException {
|
||||||
@@ -39,6 +39,7 @@ public class CrawlerRevisitor {
|
|||||||
int retained = 0;
|
int retained = 0;
|
||||||
int errors = 0;
|
int errors = 0;
|
||||||
int skipped = 0;
|
int skipped = 0;
|
||||||
|
int size = 0;
|
||||||
|
|
||||||
for (CrawledDocument doc : oldCrawlData) {
|
for (CrawledDocument doc : oldCrawlData) {
|
||||||
if (errors > 20) {
|
if (errors > 20) {
|
||||||
@@ -46,6 +47,10 @@ public class CrawlerRevisitor {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (Thread.interrupted()) {
|
||||||
|
throw new InterruptedException();
|
||||||
|
}
|
||||||
|
|
||||||
var urlMaybe = EdgeUrl.parse(doc.url);
|
var urlMaybe = EdgeUrl.parse(doc.url);
|
||||||
if (urlMaybe.isEmpty())
|
if (urlMaybe.isEmpty())
|
||||||
continue;
|
continue;
|
||||||
@@ -78,6 +83,7 @@ public class CrawlerRevisitor {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size++;
|
||||||
|
|
||||||
double skipProb;
|
double skipProb;
|
||||||
|
|
||||||
@@ -150,6 +156,8 @@ public class CrawlerRevisitor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return recrawled;
|
return new RecrawlMetadata(size, errors, skipped);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public record RecrawlMetadata(int size, int errors, int skipped) {}
|
||||||
}
|
}
|
||||||
|
@@ -8,6 +8,7 @@ import java.io.IOException;
|
|||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
|
import java.time.Duration;
|
||||||
import java.time.Instant;
|
import java.time.Instant;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.*;
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
@@ -47,8 +48,8 @@ class DomainStateDbTest {
|
|||||||
db.save(allFields);
|
db.save(allFields);
|
||||||
db.save(minFields);
|
db.save(minFields);
|
||||||
|
|
||||||
assertEquals(allFields, db.get("all.marginalia.nu").orElseThrow());
|
assertEquals(allFields, db.getSummary("all.marginalia.nu").orElseThrow());
|
||||||
assertEquals(minFields, db.get("min.marginalia.nu").orElseThrow());
|
assertEquals(minFields, db.getSummary("min.marginalia.nu").orElseThrow());
|
||||||
|
|
||||||
var updatedAllFields = new DomainStateDb.SummaryRecord(
|
var updatedAllFields = new DomainStateDb.SummaryRecord(
|
||||||
"all.marginalia.nu",
|
"all.marginalia.nu",
|
||||||
@@ -59,7 +60,19 @@ class DomainStateDbTest {
|
|||||||
);
|
);
|
||||||
|
|
||||||
db.save(updatedAllFields);
|
db.save(updatedAllFields);
|
||||||
assertEquals(updatedAllFields, db.get("all.marginalia.nu").orElseThrow());
|
assertEquals(updatedAllFields, db.getSummary("all.marginalia.nu").orElseThrow());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testMetadata() throws SQLException {
|
||||||
|
try (var db = new DomainStateDb(tempFile)) {
|
||||||
|
var original = new DomainStateDb.CrawlMeta("example.com", Instant.ofEpochMilli(12345), Duration.ofMillis(30), Duration.ofMillis(300), 1, 2, 3);
|
||||||
|
db.save(original);
|
||||||
|
|
||||||
|
var maybeMeta = db.getMeta("example.com");
|
||||||
|
assertTrue(maybeMeta.isPresent());
|
||||||
|
assertEquals(original, maybeMeta.get());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -10,7 +10,7 @@
|
|||||||
|
|
||||||
@template.part.head(title = "Marginalia Search - Explore")
|
@template.part.head(title = "Marginalia Search - Explore")
|
||||||
|
|
||||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans ">
|
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans ">
|
||||||
|
|
||||||
@template.part.navbar(navbar = navbar)
|
@template.part.navbar(navbar = navbar)
|
||||||
|
|
||||||
@@ -23,7 +23,7 @@
|
|||||||
</header>
|
</header>
|
||||||
|
|
||||||
<div class="max-w-[1400px] mx-auto flex flex-col gap-1 place-items-center">
|
<div class="max-w-[1400px] mx-auto flex flex-col gap-1 place-items-center">
|
||||||
<div class="border dark:border-gray-600 bg-white dark:bg-gray-800 dark:text-gray-100 my-4 p-3 rounded overflow-hidden flex flex-col space-y-4">
|
<div class="border border-gray-300 dark:border-gray-600 bg-white dark:bg-gray-800 dark:text-gray-100 my-4 p-3 rounded overflow-hidden flex flex-col space-y-4">
|
||||||
@if (results.hasFocusDomain())
|
@if (results.hasFocusDomain())
|
||||||
<div class="flex space-x-1">
|
<div class="flex space-x-1">
|
||||||
<span>Showing websites similar to <a class="font-mono text-liteblue dark:text-blue-200" href="/site/${results.focusDomain()}"><i class="fas fa-globe"></i> <span class="underline">${results.focusDomain()}</span></a></span>
|
<span>Showing websites similar to <a class="font-mono text-liteblue dark:text-blue-200" href="/site/${results.focusDomain()}"><i class="fas fa-globe"></i> <span class="underline">${results.focusDomain()}</span></a></span>
|
||||||
@@ -36,7 +36,7 @@
|
|||||||
</div>
|
</div>
|
||||||
<div class="grid-cols-1 gap-4 sm:grid sm:grid-cols-1 md:grid-cols-3 xl:grid-cols-4 mx-auto sm:p-4">
|
<div class="grid-cols-1 gap-4 sm:grid sm:grid-cols-1 md:grid-cols-3 xl:grid-cols-4 mx-auto sm:p-4">
|
||||||
@for (BrowseResult result : results.results())
|
@for (BrowseResult result : results.results())
|
||||||
<div class="bg-white border dark:border-gray-600 dark:bg-gray-800 rounded overflow-hidden">
|
<div class="bg-white border border-gray-300 dark:border-gray-600 dark:bg-gray-800 rounded overflow-hidden">
|
||||||
<div class="bg-margeblue text-white p-2 flex space-x-4 text-sm">
|
<div class="bg-margeblue text-white p-2 flex space-x-4 text-sm">
|
||||||
<span class="break-words">${result.displayDomain()}</span>
|
<span class="break-words">${result.displayDomain()}</span>
|
||||||
<div class="grow"></div>
|
<div class="grow"></div>
|
||||||
|
@@ -9,6 +9,7 @@
|
|||||||
nicotine: '#f8f8ee',
|
nicotine: '#f8f8ee',
|
||||||
margeblue: '#3e5f6f',
|
margeblue: '#3e5f6f',
|
||||||
liteblue: '#0066cc',
|
liteblue: '#0066cc',
|
||||||
|
bgblue: '#e5e9eb',
|
||||||
},
|
},
|
||||||
screens: {
|
screens: {
|
||||||
'coarsepointer': {
|
'coarsepointer': {
|
||||||
|
@@ -15,7 +15,7 @@
|
|||||||
|
|
||||||
@template.part.head(title = "Marginalia Search - " + parameters.query())
|
@template.part.head(title = "Marginalia Search - " + parameters.query())
|
||||||
|
|
||||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||||
@template.part.navbar(navbar = navbar)
|
@template.part.navbar(navbar = navbar)
|
||||||
|
|
||||||
<div>
|
<div>
|
||||||
|
@@ -9,7 +9,7 @@
|
|||||||
|
|
||||||
@template.part.head(title = "Marginalia Search - Error")
|
@template.part.head(title = "Marginalia Search - Error")
|
||||||
|
|
||||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||||
|
|
||||||
@template.part.navbar(navbar = navbar)
|
@template.part.navbar(navbar = navbar)
|
||||||
|
|
||||||
|
@@ -11,7 +11,7 @@
|
|||||||
|
|
||||||
@template.part.head(title = "Marginalia Search - " + results.getQuery())
|
@template.part.head(title = "Marginalia Search - " + results.getQuery())
|
||||||
|
|
||||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||||
@template.part.navbar(navbar = navbar)
|
@template.part.navbar(navbar = navbar)
|
||||||
|
|
||||||
<div>
|
<div>
|
||||||
|
@@ -13,7 +13,7 @@
|
|||||||
@for (List<SearchFilters.Filter> filterGroup : filters.getFilterGroups())
|
@for (List<SearchFilters.Filter> filterGroup : filters.getFilterGroups())
|
||||||
@for (SearchFilters.Filter filter : filterGroup)
|
@for (SearchFilters.Filter filter : filterGroup)
|
||||||
<label class="flex items-center">
|
<label class="flex items-center">
|
||||||
<button title="${filter.displayName}" onclick="document.location='$unsafe{filter.url}'" class="flex-1 py-2 pl-2 rounded flex space-x-2 dark:has-[:checked]:bg-gray-950 has-[:checked]:bg-gray-100 has-[:checked]:text-slate-900 dark:has-[:checked]:text-slate-100 hover:bg-gray-50 dark:hover:bg-gray-950 bg-white dark:bg-gray-900 dark:border dark:border-gray-600 text-margeblue dark:text-slate-200 outline-1 active:outline">
|
<button title="${filter.displayName}" onclick="document.location='$unsafe{filter.url}'" class="flex-1 py-2 pl-2 rounded flex space-x-2 dark:has-[:checked]:bg-gray-950 has-[:checked]:bg-gray-300 has-[:checked]:text-slate-900 dark:has-[:checked]:text-slate-100 hover:bg-gray-50 dark:hover:bg-gray-950 bg-white dark:bg-gray-900 dark:border dark:border-gray-600 text-margeblue dark:text-slate-200 outline-1 active:outline">
|
||||||
@if (filter.current)
|
@if (filter.current)
|
||||||
<input type="checkbox" checked class="sr-only" aria-checked="true" />
|
<input type="checkbox" checked class="sr-only" aria-checked="true" />
|
||||||
@else
|
@else
|
||||||
@@ -38,7 +38,7 @@
|
|||||||
<div class="space-y-2">
|
<div class="space-y-2">
|
||||||
@for (SearchFilters.SearchOption option : filters.searchOptions())
|
@for (SearchFilters.SearchOption option : filters.searchOptions())
|
||||||
<label class="flex items-center">
|
<label class="flex items-center">
|
||||||
<button title="${option.name()}" onclick="document.location='$unsafe{option.getUrl()}'" class="flex-1 py-2 pl-2 rounded flex space-x-2 dark:has-[:checked]:bg-gray-950 has-[:checked]:bg-gray-100 has-[:checked]:text-slate-900 dark:has-[:checked]:text-slate-100 hover:bg-gray-50 dark:hover:bg-gray-950 bg-white dark:bg-gray-900 dark:border dark:border-gray-600 text-margeblue dark:text-slate-200 outline-1 active:outline">
|
<button title="${option.name()}" onclick="document.location='$unsafe{option.getUrl()}'" class="flex-1 py-2 pl-2 rounded flex space-x-2 dark:has-[:checked]:bg-gray-950 has-[:checked]:bg-gray-300 has-[:checked]:text-slate-900 dark:has-[:checked]:text-slate-100 hover:bg-gray-50 dark:hover:bg-gray-950 bg-white dark:bg-gray-900 dark:border dark:border-gray-600 text-margeblue dark:text-slate-200 outline-1 active:outline">
|
||||||
@if (option.isSet())
|
@if (option.isSet())
|
||||||
<input type="checkbox" checked class="sr-only" aria-checked="true" />
|
<input type="checkbox" checked class="sr-only" aria-checked="true" />
|
||||||
@else
|
@else
|
||||||
|
@@ -15,7 +15,7 @@
|
|||||||
|
|
||||||
@template.part.head(title = "Marginalia Search", allowIndexing = true)
|
@template.part.head(title = "Marginalia Search", allowIndexing = true)
|
||||||
|
|
||||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||||
|
|
||||||
@template.part.navbar(navbar = navbar)
|
@template.part.navbar(navbar = navbar)
|
||||||
|
|
||||||
@@ -32,18 +32,14 @@
|
|||||||
|
|
||||||
@if (model.news().isEmpty())
|
@if (model.news().isEmpty())
|
||||||
<div class="max-w-7xl mx-auto flex flex-col space-y-4 fill-w">
|
<div class="max-w-7xl mx-auto flex flex-col space-y-4 fill-w">
|
||||||
<div class="border dark:border-gray-600 dark:bg-gray-800 bg-white rounded p-2 m-4 ">
|
<div class="border border-gray-300 border-gray-100 dark:border-gray-600 dark:bg-gray-800 bg-white rounded p-2 m-4 ">
|
||||||
<div class="text-slate-700 dark:text-white text-sm p-4">
|
<div class="text-slate-700 dark:text-white text-sm p-4">
|
||||||
<div class="fas fa-gift mr-1 text-margeblue dark:text-slate-200"></div>
|
The old version of Marginalia Search remains available
|
||||||
This is the new design and home of Marginalia Search.
|
<a href="https://old-search.marginalia.nu/" class="underline text-liteblue dark:text-blue-200">here</a>.
|
||||||
You can read about what this entails <a href="https://about.marginalia-search.com/article/redesign/" class="underline text-liteblue dark:text-blue-200">here</a>.
|
|
||||||
<p class="my-4"></p>
|
|
||||||
The old version of Marginalia Search remains available at
|
|
||||||
<a href="https://old-search.marginalia.nu/" class="underline text-liteblue dark:text-blue-200">https://old-search.marginalia.nu/</a>.
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="mx-auto flex flex-col sm:flex-row my-4 sm:space-x-2 space-y-2 sm:space-y-0 w-full md:w-auto px-2">
|
<div class="mx-auto flex flex-col sm:flex-row my-4 sm:space-x-2 space-y-2 sm:space-y-0 w-full md:w-auto px-2">
|
||||||
<div class="flex flex-col border dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3">
|
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3">
|
||||||
<div><i class="fas fa-sailboat mx-2 text-margeblue dark:text-slate-200"></i>Explore the Web</div>
|
<div><i class="fas fa-sailboat mx-2 text-margeblue dark:text-slate-200"></i>Explore the Web</div>
|
||||||
<ul class="list-disc ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
<ul class="list-disc ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
||||||
<li>Prioritizes non-commercial content</li>
|
<li>Prioritizes non-commercial content</li>
|
||||||
@@ -52,7 +48,7 @@
|
|||||||
</ul>
|
</ul>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="flex flex-col border dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3 ">
|
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3 ">
|
||||||
<div><i class="fas fa-hand-holding-hand mx-2 text-margeblue dark:text-slate-200"></i>Open Source</div>
|
<div><i class="fas fa-hand-holding-hand mx-2 text-margeblue dark:text-slate-200"></i>Open Source</div>
|
||||||
<ul class="list-disc ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
<ul class="list-disc ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
||||||
<li>Custom index and crawler software</li>
|
<li>Custom index and crawler software</li>
|
||||||
@@ -65,7 +61,7 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="flex flex-col border dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3 ">
|
<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3 ">
|
||||||
<div><i class="fas fa-lock mx-2 text-margeblue dark:text-slate-200"></i> Privacy by default</div>
|
<div><i class="fas fa-lock mx-2 text-margeblue dark:text-slate-200"></i> Privacy by default</div>
|
||||||
<ul class="list-disc ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
<ul class="list-disc ml-6 text-slate-700 dark:text-white text-xs leading-5">
|
||||||
<li>Filter out tracking and adtech</li>
|
<li>Filter out tracking and adtech</li>
|
||||||
|
@@ -13,7 +13,7 @@
|
|||||||
|
|
||||||
@template.part.head(title = "Marginalia Search - " + parameters.query())
|
@template.part.head(title = "Marginalia Search - " + parameters.query())
|
||||||
|
|
||||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||||
@template.part.navbar(navbar = navbar)
|
@template.part.navbar(navbar = navbar)
|
||||||
|
|
||||||
<div>
|
<div>
|
||||||
|
@@ -11,7 +11,7 @@
|
|||||||
|
|
||||||
@template.part.head(title = "Marginalia Search - " + model.domainA() + "/" + model.domainB())
|
@template.part.head(title = "Marginalia Search - " + model.domainA() + "/" + model.domainB())
|
||||||
|
|
||||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||||
|
|
||||||
@template.part.navbar(navbar = navbar)
|
@template.part.navbar(navbar = navbar)
|
||||||
|
|
||||||
|
@@ -9,7 +9,7 @@
|
|||||||
|
|
||||||
@template.part.head(title = "Marginalia Search - " + model.domain())
|
@template.part.head(title = "Marginalia Search - " + model.domain())
|
||||||
|
|
||||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||||
|
|
||||||
@template.part.navbar(navbar = navbar)
|
@template.part.navbar(navbar = navbar)
|
||||||
|
|
||||||
|
@@ -7,7 +7,7 @@
|
|||||||
|
|
||||||
@if (!list.isEmpty())
|
@if (!list.isEmpty())
|
||||||
|
|
||||||
<div class="bg-white dark:bg-gray-800 shadow-sm rounded overflow-hidden border dark:border-gray-600">
|
<div class="bg-white dark:bg-gray-800 shadow-sm rounded overflow-hidden border border-gray-300 dark:border-gray-600">
|
||||||
<div class="px-4 py-2 bg-margeblue text-white border-b border-gray-200 dark:border-gray-600 flex place-items-baseline">
|
<div class="px-4 py-2 bg-margeblue text-white border-b border-gray-200 dark:border-gray-600 flex place-items-baseline">
|
||||||
<h2 class="text-md">${title}</h2>
|
<h2 class="text-md">${title}</h2>
|
||||||
<div class="grow"></div>
|
<div class="grow"></div>
|
||||||
|
@@ -9,11 +9,11 @@
|
|||||||
|
|
||||||
@template.part.head(title = "Marginalia Search - Site Viewer")
|
@template.part.head(title = "Marginalia Search - Site Viewer")
|
||||||
|
|
||||||
<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
|
<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
|
||||||
|
|
||||||
@template.part.navbar(navbar = navbar)
|
@template.part.navbar(navbar = navbar)
|
||||||
|
|
||||||
<header class="border-b border-gray-300 dark:border-gray-600 bg-white dark:bg-gray-800 shadow-md">
|
<header class="border-b border-gray-300 border-gray-300 dark:border-gray-600 bg-white dark:bg-gray-800 shadow-md">
|
||||||
<div class="max-w-[1400px] mx-auto px-4 py-4">
|
<div class="max-w-[1400px] mx-auto px-4 py-4">
|
||||||
<h1 class="text-base md:text-xl mr-2 md:mr-8 font-serif">View Site Information</h1>
|
<h1 class="text-base md:text-xl mr-2 md:mr-8 font-serif">View Site Information</h1>
|
||||||
</div>
|
</div>
|
||||||
@@ -22,7 +22,7 @@
|
|||||||
<div class="max-w-[1000px] mx-auto flex gap-4 flex-col md:flex-row place-items-center md:place-items-start p-4">
|
<div class="max-w-[1000px] mx-auto flex gap-4 flex-col md:flex-row place-items-center md:place-items-start p-4">
|
||||||
|
|
||||||
|
|
||||||
<div class="border dark:border-gray-600 rounded md:my-4 overflow-hidden bg-white dark:bg-gray-800 flex flex-col space-y-2 flex-1">
|
<div class="border border-gray-300 dark:border-gray-600 rounded md:my-4 overflow-hidden bg-white dark:bg-gray-800 flex flex-col space-y-2 flex-1">
|
||||||
<div class="bg-margeblue text-white p-2 text-sm mb-2">View Site Information</div>
|
<div class="bg-margeblue text-white p-2 text-sm mb-2">View Site Information</div>
|
||||||
|
|
||||||
<p class="mx-4">This utility lets you explore what the search engine knows about the web,
|
<p class="mx-4">This utility lets you explore what the search engine knows about the web,
|
||||||
@@ -45,7 +45,7 @@
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
@if (!model.domains().isEmpty())
|
@if (!model.domains().isEmpty())
|
||||||
<div class="border dark:border-gray-600 rounded md:my-4 overflow-hidden w-full md:w-auto">
|
<div class="border border-gray-300 dark:border-gray-600 rounded md:my-4 overflow-hidden w-full md:w-auto">
|
||||||
<div class="bg-margeblue text-white p-2 text-sm">Recently Discovered Domains</div>
|
<div class="bg-margeblue text-white p-2 text-sm">Recently Discovered Domains</div>
|
||||||
|
|
||||||
|
|
||||||
|
@@ -8,17 +8,17 @@
|
|||||||
<div class="flex flex-col space-y-4 my-4 w-full">
|
<div class="flex flex-col space-y-4 my-4 w-full">
|
||||||
|
|
||||||
@if (backlinks.results().isEmpty())
|
@if (backlinks.results().isEmpty())
|
||||||
<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm ">
|
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm ">
|
||||||
The search engine isn't aware of any backlinks to ${backlinks.domain()}!
|
The search engine isn't aware of any backlinks to ${backlinks.domain()}!
|
||||||
</div>
|
</div>
|
||||||
@else
|
@else
|
||||||
<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
|
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
|
||||||
Showing documents linking to ${backlinks.domain()}
|
Showing documents linking to ${backlinks.domain()}
|
||||||
</div>
|
</div>
|
||||||
@endif
|
@endif
|
||||||
|
|
||||||
@for (GroupedUrlDetails group : backlinks.results())
|
@for (GroupedUrlDetails group : backlinks.results())
|
||||||
<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden mx-4">
|
<div class="border dark:border-gray-600 border-gray-300 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden mx-4">
|
||||||
<div class="flex space-x-2 flex-row place-items-baseline bg-margeblue text-white p-2 text-md">
|
<div class="flex space-x-2 flex-row place-items-baseline bg-margeblue text-white p-2 text-md">
|
||||||
<span class="fas fa-globe"></span>
|
<span class="fas fa-globe"></span>
|
||||||
<a href="/site/${group.domain().toString()}">${group.domain().toString()}</a>
|
<a href="/site/${group.domain().toString()}">${group.domain().toString()}</a>
|
||||||
|
@@ -9,17 +9,17 @@
|
|||||||
<div class="flex flex-col space-y-4 my-4">
|
<div class="flex flex-col space-y-4 my-4">
|
||||||
|
|
||||||
@if (docs.results().isEmpty())
|
@if (docs.results().isEmpty())
|
||||||
<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
|
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
|
||||||
The search engine doesn't index any documents from ${docs.domain()}
|
The search engine doesn't index any documents from ${docs.domain()}
|
||||||
</div>
|
</div>
|
||||||
@else
|
@else
|
||||||
<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
|
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
|
||||||
Showing documents from ${docs.domain()}
|
Showing documents from ${docs.domain()}
|
||||||
</div>
|
</div>
|
||||||
@endif
|
@endif
|
||||||
|
|
||||||
@for (UrlDetails details : docs.results())
|
@for (UrlDetails details : docs.results())
|
||||||
<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden mx-4">
|
<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden mx-4">
|
||||||
<div class="flex grow justify-between items-start p-4">
|
<div class="flex grow justify-between items-start p-4">
|
||||||
<div class="flex-1">
|
<div class="flex-1">
|
||||||
<h2 class="text-xl text-gray-800 dark:text-white font-serif mr-4">
|
<h2 class="text-xl text-gray-800 dark:text-white font-serif mr-4">
|
||||||
|
@@ -8,7 +8,7 @@
|
|||||||
<!-- Main content -->
|
<!-- Main content -->
|
||||||
|
|
||||||
<div class="flex-1 p-4 space-y-4 mx-auto w-full md:w-auto">
|
<div class="flex-1 p-4 space-y-4 mx-auto w-full md:w-auto">
|
||||||
<div class="flex border dark:border-gray-600 rounded bg-white dark:bg-gray-800 flex-col space-y-4 pb-4 overflow-hidden md:max-w-lg" >
|
<div class="flex border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 flex-col space-y-4 pb-4 overflow-hidden md:max-w-lg" >
|
||||||
<div class="flex place-items-center space-x-2 p-2 text-md border-b dark:border-gray-600 bg-margeblue text-white">
|
<div class="flex place-items-center space-x-2 p-2 text-md border-b dark:border-gray-600 bg-margeblue text-white">
|
||||||
<img src="/site/${siteInfo.domain()}/favicon" style="width: 16px; height: 16px; vertical-align: center">
|
<img src="/site/${siteInfo.domain()}/favicon" style="width: 16px; height: 16px; vertical-align: center">
|
||||||
<span>${siteInfo.domain()}</span>
|
<span>${siteInfo.domain()}</span>
|
||||||
|
@@ -4,7 +4,7 @@
|
|||||||
@param ReportDomain reportDomain
|
@param ReportDomain reportDomain
|
||||||
|
|
||||||
<div class="flex-col mx-auto">
|
<div class="flex-col mx-auto">
|
||||||
<div class="max-w-2xl mx-auto bg-white dark:bg-gray-800 border dark:border-gray-600 rounded overflow-auto shadow-sm my-4 space-y-4 w-full">
|
<div class="max-w-2xl mx-auto bg-white dark:bg-gray-800 border border-gray-300 dark:border-gray-600 rounded overflow-auto shadow-sm my-4 space-y-4 w-full">
|
||||||
<div class="px-4 py-2 bg-margeblue text-white border-b border-gray-200 dark:border-gray-800">
|
<div class="px-4 py-2 bg-margeblue text-white border-b border-gray-200 dark:border-gray-800">
|
||||||
<h2 class="text-md">Report Domain Issue</h2>
|
<h2 class="text-md">Report Domain Issue</h2>
|
||||||
</div>
|
</div>
|
||||||
|
@@ -9,6 +9,7 @@ module.exports = {
|
|||||||
nicotine: '#f8f8ee',
|
nicotine: '#f8f8ee',
|
||||||
margeblue: '#3e5f6f',
|
margeblue: '#3e5f6f',
|
||||||
liteblue: '#0066cc',
|
liteblue: '#0066cc',
|
||||||
|
bgblue: '#e5e9eb',
|
||||||
},
|
},
|
||||||
screens: {
|
screens: {
|
||||||
'coarsepointer': {
|
'coarsepointer': {
|
||||||
|
Reference in New Issue
Block a user