mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits


24 Commits

Author SHA1 Message Date
Viktor Lofgren
c91af247e9 (rate-limit) Fix rate limiting logic
The rate limiter was misconfigured to regenerate tokens at a fixed rate of 1 token per refillRate seconds, rather than refillRate tokens per minute. The default bucket size is also increased to 4x the refill rate.
2025-04-05 12:26:26 +02:00
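A minimal sketch of the before/after bucket configuration, assuming the Bucket4j calls used in the RateLimiter diff further down (Refill.greedy, Bandwidth.classic); the numeric values here are illustrative, not taken from the code:

import io.github.bucket4j.Bandwidth;
import io.github.bucket4j.Bucket;
import io.github.bucket4j.Refill;

import java.time.Duration;

class RateLimiterRefillSketch {
    static final int refillRate = 60;           // hypothetical: 60 requests per minute
    static final int capacity = 4 * refillRate; // new default bucket size: 4x refill rate

    static Bucket before() {
        // Misconfigured: regenerates 1 token every refillRate seconds
        return Bucket.builder()
                .addLimit(Bandwidth.classic(capacity, Refill.greedy(1, Duration.ofSeconds(refillRate))))
                .build();
    }

    static Bucket after() {
        // Fixed: regenerates refillRate tokens per minute
        return Bucket.builder()
                .addLimit(Bandwidth.classic(capacity, Refill.greedy(refillRate, Duration.ofSeconds(60))))
                .build();
    }
}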
Viktor Lofgren
7a31227de1 (crawler) Filter out robots.txt-sitemaps that belong to different domains 2025-04-02 13:35:37 +02:00
Viktor Lofgren
4f477604c5 (crawler) Improve error handling in parquet->slop conversion
Parquet code throws a RuntimeException, which was not correctly caught, leading to a failure to crawl.
2025-04-02 13:16:01 +02:00
Viktor Lofgren
2970f4395b (minor) Test code cleanup 2025-04-02 13:16:01 +02:00
Viktor Lofgren
d1ec909b36 (crawler) Improve handling of timeouts to prevent crawler from getting stuck 2025-04-02 12:57:21 +02:00
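The gist of the timeout handling, as a hedged sketch: a blocking InputStream.read() is handed to a virtual-thread executor so the caller can bound the wait with Future.get(timeout) instead of hanging on a stuck socket. This mirrors the WarcInputBuffer change further down; class and method names here are illustrative:

import java.io.InputStream;
import java.time.Duration;
import java.time.Instant;
import java.util.concurrent.*;

class BoundedReadSketch {
    private static final ExecutorService exec = Executors.newVirtualThreadPerTaskExecutor();

    // Reads into buf, but never waits past the given deadline.
    static int readWithDeadline(InputStream is, byte[] buf, Instant deadline)
            throws TimeoutException, ExecutionException, InterruptedException {
        Duration remaining = Duration.between(Instant.now(), deadline);
        if (remaining.isNegative())
            throw new TimeoutException("deadline passed before read");

        Future<Integer> pending = exec.submit(() -> is.read(buf));
        return pending.get(remaining.toMillis(), TimeUnit.MILLISECONDS);
    }
}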
Viktor Lofgren
c67c5bbf42 (crawler) Experimentally drop to HTTP 1.1 for crawler to see if this solves stuck send()s 2025-04-01 12:05:21 +02:00
Viktor Lofgren
ecb0e57a1a (crawler) Make the use of virtual threads in the crawler configurable via system properties 2025-03-27 21:26:05 +01:00
Viktor Lofgren
8c61f61b46 (crawler) Add crawling metadata to domainstate db 2025-03-27 16:38:37 +01:00
Viktor Lofgren
662a18c933 Revert "(crawler) Further rearrange crawl order"
This reverts commit 1c2426a052.

The change does not appear necessary to avoid problems.
2025-03-27 11:25:08 +01:00
Viktor Lofgren
1c2426a052 (crawler) Further rearrange crawl order
Limit crawl order preference to edu domains, to avoid hitting sites like Medium and WordPress with shotgun requests.
2025-03-27 11:19:20 +01:00
Viktor Lofgren
34df7441ac (crawler) Add some jitter to crawl delay to avoid accidentally synchronized requests 2025-03-27 11:15:16 +01:00
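Roughly how the jitter works, as a sketch; the 0-150 ms range matches the CrawlDelayTimer diff further down, while the delay values are hypothetical:

import java.util.concurrent.ThreadLocalRandom;

class JitterSketch {
    // Adds a small random offset to each crawl delay so tasks that started
    // in lockstep drift apart instead of hitting a server simultaneously.
    static void sleepWithJitter(long sleepTimeMs, long spentTimeMs) throws InterruptedException {
        if (spentTimeMs > sleepTimeMs)
            return;
        long jitter = ThreadLocalRandom.current().nextLong(0, 150);
        Thread.sleep(Math.min(sleepTimeMs - spentTimeMs, 5000) + jitter);
    }
}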
Viktor Lofgren
5387e2bd80 (crawler) Adjust crawl order to get a better mixture of domains 2025-03-27 11:12:48 +01:00
Viktor Lofgren
0f3b24d0f8 (crawler) Evaluate virtual threads for the crawler
The change also alters SimpleBlockingThreadPool to add the option to use virtual threads instead of platform threads.
2025-03-27 11:02:21 +01:00
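A condensed sketch of the platform-vs-virtual thread switch using the Java 21+ Thread.Builder API, mirroring the SimpleBlockingThreadPool diff below; the factory class itself is illustrative:

class WorkerThreadSketch {
    enum ThreadType { VIRTUAL, PLATFORM }

    static Thread startWorker(ThreadType type, String name, Runnable body) {
        Thread.Builder builder = switch (type) {
            case VIRTUAL  -> Thread.ofVirtual();
            case PLATFORM -> Thread.ofPlatform().daemon(true);
        };
        return builder.name(name).start(body);
    }
}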
Viktor Lofgren
a732095d2a (crawler) Improve crawl task ordering
Further improve the ordering of the crawl tasks in order to ensure that potentially blocking tasks are enqueued as soon as possible.
2025-03-26 16:51:37 +01:00
Viktor Lofgren
6607f0112f (crawler) Improve how the crawler deals with interruptions
In some cases, threads would previously fail to terminate when interrupted.
2025-03-26 16:19:57 +01:00
Viktor Lofgren
4913730de9 (jdk) Upgrade to Java 24 2025-03-26 13:26:06 +01:00
Viktor Lofgren
1db64f9d56 (chore) Fix zookeeper test by upgrading zk image version.
Test suddenly broke due to the increasing entropy of the universe.
2025-03-26 11:47:14 +01:00
Viktor Lofgren
4dcff14498 (search) Improve contrast with light mode 2025-03-25 13:15:31 +01:00
Viktor Lofgren
426658f64e (search) Improve contrast with light mode 2025-03-25 11:54:54 +01:00
Viktor Lofgren
2181b22f05 (crawler) Change default maxConcurrentRequests to 512
This seems like a more sensible default after testing a bit.  May need local tuning.
2025-03-22 12:11:09 +01:00
Viktor Lofgren
42bd79a609 (crawler) Experimentally throttle the number of active retrievals to see how this affects the network performance
There have been some indications that request storms lead to buffer bloat and poor throughput.

This adds a configurable semaphore, by default permitting 100 active requests.
2025-03-22 11:50:37 +01:00
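The throttling idea in miniature: a process-wide semaphore caps in-flight requests, wrapped in an AutoCloseable guard so permits are released by try-with-resources. The property name and the default of 100 follow the commit message; the class name here is illustrative (the diff below adds it as SendLock):

import java.util.concurrent.Semaphore;

class RequestThrottleSketch implements AutoCloseable {
    private static final Semaphore permits =
            new Semaphore(Integer.getInteger("crawler.maxConcurrentRequests", 100));

    RequestThrottleSketch() {
        permits.acquireUninterruptibly();
    }

    @Override
    public void close() {
        permits.release();
    }

    // Usage: try (var lock = new RequestThrottleSketch()) { /* perform the HTTP request */ }
}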
Viktor Lofgren
b91c1e528a (favicon) Send dummy svg result when image is missing
This prevents the browser from rendering a "broken image" in this scenario.
2025-03-21 15:15:14 +01:00
Viktor Lofgren
b1130d7a04 (domainstatedb) Allow creation of disconnected db
This is required so that executor services that do not have crawl data can still initialize.
2025-03-21 14:59:36 +01:00
Viktor Lofgren
8364bcdc97 (favicon) Add favicons to the matchograms 2025-03-21 14:30:40 +01:00
37 changed files with 559 additions and 167 deletions

View File

@@ -43,12 +43,11 @@ subprojects.forEach {it ->
     }
     ext {
-        jvmVersion=23
-        dockerImageBase='container-registry.oracle.com/graalvm/jdk:23'
+        jvmVersion = 24
+        dockerImageBase='container-registry.oracle.com/graalvm/jdk:24'
         dockerImageTag='latest'
         dockerImageRegistry='marginalia'
         jibVersion = '3.4.4'
     }
     idea {

View File

@@ -14,7 +14,7 @@ public class EdgeDomain implements Serializable {
     @Nonnull
     public final String topDomain;

-    public EdgeDomain(String host) {
+    public EdgeDomain(@Nonnull String host) {
         Objects.requireNonNull(host, "domain name must not be null");

         host = host.toLowerCase();
@@ -61,6 +61,10 @@ public class EdgeDomain implements Serializable {
         this.topDomain = topDomain;
     }

+    public static String getTopDomain(String host) {
+        return new EdgeDomain(host).topDomain;
+    }
+
     private boolean looksLikeGovTld(String host) {
         if (host.length() < 8)
             return false;
@@ -116,24 +120,6 @@ public class EdgeDomain implements Serializable {
         return topDomain.substring(0, cutPoint).toLowerCase();
     }

-    public String getLongDomainKey() {
-        StringBuilder ret = new StringBuilder();
-
-        int cutPoint = topDomain.indexOf('.');
-        if (cutPoint < 0) {
-            ret.append(topDomain);
-        } else {
-            ret.append(topDomain, 0, cutPoint);
-        }
-
-        if (!subDomain.isEmpty() && !"www".equals(subDomain)) {
-            ret.append(":");
-            ret.append(subDomain);
-        }
-
-        return ret.toString().toLowerCase();
-    }
-
     /** If possible, try to provide an alias domain,
      * i.e. a domain name that is very likely to link to this one
      * */

View File

@@ -35,21 +35,8 @@ public class RateLimiter {
     }

-    public static RateLimiter forExpensiveRequest() {
-        return new RateLimiter(5, 10);
-    }
-
     public static RateLimiter custom(int perMinute) {
-        return new RateLimiter(perMinute, 60);
-    }
-
-    public static RateLimiter forSpamBots() {
-        return new RateLimiter(120, 3600);
-    }
-
-    public static RateLimiter forLogin() {
-        return new RateLimiter(3, 15);
+        return new RateLimiter(4 * perMinute, perMinute);
     }

     private void cleanIdleBuckets() {
@@ -62,7 +49,7 @@ public class RateLimiter {
     }

     private Bucket createBucket() {
-        var refill = Refill.greedy(1, Duration.ofSeconds(refillRate));
+        var refill = Refill.greedy(refillRate, Duration.ofSeconds(60));
         var bw = Bandwidth.classic(capacity, refill);
         return Bucket.builder().addLimit(bw).build();
     }

View File

@@ -25,7 +25,7 @@ import static org.mockito.Mockito.when;
 class ZkServiceRegistryTest {
     private static final int ZOOKEEPER_PORT = 2181;
     private static final GenericContainer<?> zookeeper =
-            new GenericContainer<>("zookeeper:3.8.0")
+            new GenericContainer<>("zookeeper:3.8")
                     .withExposedPorts(ZOOKEEPER_PORT);

     List<ZkServiceRegistry> registries = new ArrayList<>();

View File

@@ -21,6 +21,10 @@ public class FaviconGrpcService extends FaviconAPIGrpc.FaviconAPIImplBase implem
         this.domainStateDb = domainStateDb;
     }

+    public boolean shouldRegisterService() {
+        return domainStateDb.isAvailable();
+    }
+
     @Override
     public void getFavicon(RpcFaviconRequest request, StreamObserver<RpcFaviconResponse> responseObserver) {
         Optional<DomainStateDb.FaviconRecord> icon = domainStateDb.getIcon(request.getDomain());

View File

@@ -23,16 +23,33 @@ public class SimpleBlockingThreadPool {
     private final Logger logger = LoggerFactory.getLogger(SimpleBlockingThreadPool.class);

     public SimpleBlockingThreadPool(String name, int poolSize, int queueSize) {
+        this(name, poolSize, queueSize, ThreadType.PLATFORM);
+    }
+
+    public SimpleBlockingThreadPool(String name, int poolSize, int queueSize, ThreadType threadType) {
         tasks = new ArrayBlockingQueue<>(queueSize);

         for (int i = 0; i < poolSize; i++) {
-            Thread worker = new Thread(this::worker, name + "[" + i + "]");
-            worker.setDaemon(true);
-            worker.start();
+
+            Thread.Builder threadBuilder = switch (threadType) {
+                case VIRTUAL -> Thread.ofVirtual();
+                case PLATFORM -> Thread.ofPlatform().daemon(true);
+            };
+
+            Thread worker = threadBuilder
+                    .name(name + "[" + i + "]")
+                    .start(this::worker);
+
             workers.add(worker);
         }
     }

+    public enum ThreadType {
+        VIRTUAL,
+        PLATFORM
+    }
+
     public void submit(Task task) throws InterruptedException {
         tasks.put(task);
     }

View File

@@ -41,10 +41,7 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
 import java.security.Security;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -106,9 +103,18 @@ public class CrawlerMain extends ProcessMainClass {
         this.blacklist = blacklist;
         this.node = processConfiguration.node();

+        SimpleBlockingThreadPool.ThreadType threadType;
+        if (Boolean.getBoolean("crawler.useVirtualThreads")) {
+            threadType = SimpleBlockingThreadPool.ThreadType.VIRTUAL;
+        }
+        else {
+            threadType = SimpleBlockingThreadPool.ThreadType.PLATFORM;
+        }
+
         pool = new SimpleBlockingThreadPool("CrawlerPool",
                 Integer.getInteger("crawler.poolSize", 256),
-                1);
+                1,
+                threadType);

         // Wait for the blacklist to be loaded before starting the crawl
@@ -224,10 +230,7 @@ public class CrawlerMain extends ProcessMainClass {
             logger.info("Loaded {} domains", crawlSpecRecords.size());

-            // Shuffle the domains to ensure we get a good mix of domains in each crawl,
-            // so that e.g. the big domains don't get all crawled at once, or we end up
-            // crawling the same server in parallel from different subdomains...
-            Collections.shuffle(crawlSpecRecords);
+            crawlSpecRecords.sort(crawlSpecArrangement(crawlSpecRecords));

             // First a validation run to ensure the file is all good to parse
             if (crawlSpecRecords.isEmpty()) {
@@ -306,6 +309,30 @@ public class CrawlerMain extends ProcessMainClass {
         }
     }

+    /** Create a comparator that sorts the crawl specs in a way that is beneficial for the crawl,
+     * we want to enqueue domains that have common top domains first, but otherwise have a random
+     * order.
+     * <p></p>
+     * Note, we can't use hash codes for randomization as it is not desirable to have the same order
+     * every time the process is restarted (and CrawlSpecRecord is a record, which defines equals and
+     * hashcode based on the fields).
+     * */
+    private Comparator<CrawlSpecRecord> crawlSpecArrangement(List<CrawlSpecRecord> records) {
+        Random r = new Random();
+        Map<String, Integer> topDomainCounts = new HashMap<>(4 + (int) Math.sqrt(records.size()));
+        Map<String, Integer> randomOrder = new HashMap<>(records.size());
+
+        for (var spec : records) {
+            topDomainCounts.merge(EdgeDomain.getTopDomain(spec.domain), 1, Integer::sum);
+            randomOrder.put(spec.domain, r.nextInt());
+        }
+
+        return Comparator.comparing((CrawlSpecRecord spec) -> topDomainCounts.getOrDefault(EdgeDomain.getTopDomain(spec.domain), 0) >= 8)
+                .reversed()
+                .thenComparing(spec -> randomOrder.get(spec.domain))
+                .thenComparing(Record::hashCode); // non-deterministic tie-breaker to
+    }
+
     /** Submit a task for execution if it can be run, returns true if it was submitted
      * or if it can be discarded */
     private boolean trySubmitDeferredTask(CrawlTask task) {
@@ -474,7 +501,7 @@ public class CrawlerMain extends ProcessMainClass {

                     return new CrawlDataReference(slopPath);
                 }
-            } catch (IOException e) {
+            } catch (Exception e) {
                 logger.debug("Failed to read previous crawl data for {}", specification.domain());
             }

View File

@@ -11,6 +11,7 @@ import java.nio.file.Path;
 import java.sql.Connection;
 import java.sql.DriverManager;
 import java.sql.SQLException;
+import java.time.Duration;
 import java.time.Instant;
 import java.util.Objects;
 import java.util.Optional;
@@ -24,6 +25,17 @@ public class DomainStateDb implements AutoCloseable {

     private final Connection connection;

+    public record CrawlMeta(
+            String domainName,
+            Instant lastFullCrawl,
+            Duration recrawlTime,
+            Duration crawlTime,
+            int recrawlErrors,
+            int crawlChanges,
+            int totalCrawlSize
+    ) {}
+
     public record SummaryRecord(
             String domainName,
             Instant lastUpdated,
@@ -79,11 +91,16 @@ public class DomainStateDb implements AutoCloseable {
             return fs.asPath().resolve("domainstate.db");
         }
         else {
-            throw new SQLException("Could not find crawl data storage");
+            return null;
         }
     }

-    public DomainStateDb(Path filename) throws SQLException {
+    public DomainStateDb(@Nullable Path filename) throws SQLException {
+        if (null == filename) {
+            connection = null;
+            return;
+        }
+
         String sqliteDbString = "jdbc:sqlite:" + filename.toString();
         connection = DriverManager.getConnection(sqliteDbString);
@@ -97,6 +114,17 @@ public class DomainStateDb implements AutoCloseable {
                         feedUrl TEXT
                         )
                         """);
+            stmt.executeUpdate("""
+                        CREATE TABLE IF NOT EXISTS crawl_meta (
+                            domain TEXT PRIMARY KEY,
+                            lastFullCrawlEpochMs LONG NOT NULL,
+                            recrawlTimeMs LONG NOT NULL,
+                            recrawlErrors INTEGER NOT NULL,
+                            crawlTimeMs LONG NOT NULL,
+                            crawlChanges INTEGER NOT NULL,
+                            totalCrawlSize INTEGER NOT NULL
+                        )
+                        """);
             stmt.executeUpdate("""
                         CREATE TABLE IF NOT EXISTS favicon (
                             domain TEXT PRIMARY KEY,
@@ -110,11 +138,18 @@ public class DomainStateDb implements AutoCloseable {

     @Override
     public void close() throws SQLException {
-        connection.close();
+        if (connection != null) {
+            connection.close();
+        }
     }

+    public boolean isAvailable() {
+        return connection != null;
+    }
+
     public void saveIcon(String domain, FaviconRecord faviconRecord) {
+        if (connection == null) throw new IllegalStateException("No connection to domainstate db");
+
         try (var stmt = connection.prepareStatement("""
                 INSERT OR REPLACE INTO favicon (domain, contentType, icon)
                 VALUES(?, ?, ?)
@@ -130,6 +165,9 @@ public class DomainStateDb implements AutoCloseable {
     }

     public Optional<FaviconRecord> getIcon(String domain) {
+        if (connection == null)
+            return Optional.empty();
+
         try (var stmt = connection.prepareStatement("SELECT contentType, icon FROM favicon WHERE DOMAIN = ?")) {
             stmt.setString(1, domain);
             var rs = stmt.executeQuery();
@@ -149,7 +187,29 @@ public class DomainStateDb implements AutoCloseable {
         return Optional.empty();
     }

+    public void save(CrawlMeta crawlMeta) {
+        if (connection == null) throw new IllegalStateException("No connection to domainstate db");
+
+        try (var stmt = connection.prepareStatement("""
+                INSERT OR REPLACE INTO crawl_meta (domain, lastFullCrawlEpochMs, recrawlTimeMs, recrawlErrors, crawlTimeMs, crawlChanges, totalCrawlSize)
+                VALUES (?, ?, ?, ?, ?, ?, ?)
+                """)) {
+            stmt.setString(1, crawlMeta.domainName());
+            stmt.setLong(2, crawlMeta.lastFullCrawl.toEpochMilli());
+            stmt.setLong(3, crawlMeta.recrawlTime.toMillis());
+            stmt.setInt(4, crawlMeta.recrawlErrors);
+            stmt.setLong(5, crawlMeta.crawlTime.toMillis());
+            stmt.setInt(6, crawlMeta.crawlChanges);
+            stmt.setInt(7, crawlMeta.totalCrawlSize);
+            stmt.executeUpdate();
+        } catch (SQLException e) {
+            logger.error("Failed to insert crawl meta record", e);
+        }
+    }
+
     public void save(SummaryRecord record) {
+        if (connection == null) throw new IllegalStateException("No connection to domainstate db");
+
         try (var stmt = connection.prepareStatement("""
                 INSERT OR REPLACE INTO summary (domain, lastUpdatedEpochMs, state, stateDesc, feedUrl)
                 VALUES (?, ?, ?, ?, ?)
@@ -165,7 +225,38 @@ public class DomainStateDb implements AutoCloseable {
         }
     }

-    public Optional<SummaryRecord> get(String domainName) {
+    public Optional<CrawlMeta> getMeta(String domainName) {
+        if (connection == null)
+            return Optional.empty();
+
+        try (var stmt = connection.prepareStatement("""
+                SELECT domain, lastFullCrawlEpochMs, recrawlTimeMs, recrawlErrors, crawlTimeMs, crawlChanges, totalCrawlSize
+                FROM crawl_meta
+                WHERE domain = ?
+                """)) {
+            stmt.setString(1, domainName);
+            var rs = stmt.executeQuery();
+            if (rs.next()) {
+                return Optional.of(new CrawlMeta(
+                        rs.getString("domain"),
+                        Instant.ofEpochMilli(rs.getLong("lastFullCrawlEpochMs")),
+                        Duration.ofMillis(rs.getLong("recrawlTimeMs")),
+                        Duration.ofMillis(rs.getLong("crawlTimeMs")),
+                        rs.getInt("recrawlErrors"),
+                        rs.getInt("crawlChanges"),
+                        rs.getInt("totalCrawlSize")
+                ));
+            }
+        } catch (SQLException ex) {
+            logger.error("Failed to get crawl meta record", ex);
+        }
+
+        return Optional.empty();
+    }
+
+    public Optional<SummaryRecord> getSummary(String domainName) {
+        if (connection == null)
+            return Optional.empty();
+
         try (var stmt = connection.prepareStatement("""
                 SELECT domain, lastUpdatedEpochMs, state, stateDesc, feedUrl
                 FROM summary

View File

@@ -29,7 +29,9 @@ import java.net.http.HttpResponse;
 import java.net.http.HttpTimeoutException;
 import java.time.Duration;
 import java.util.*;
+import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
+import java.util.concurrent.Semaphore;
 import java.util.zip.GZIPInputStream;
@@ -55,12 +57,22 @@ public class HttpFetcherImpl implements HttpFetcher {

     private final HttpClient client;

     private HttpClient createClient() {
+        final ExecutorService executorService;
+
+        if (Boolean.getBoolean("crawler.httpclient.useVirtualThreads")) {
+            executorService = Executors.newVirtualThreadPerTaskExecutor();
+        }
+        else {
+            executorService = Executors.newCachedThreadPool();
+        }
+
         return HttpClient.newBuilder()
                 .sslContext(NoSecuritySSL.buildSslContext())
                 .cookieHandler(cookies)
                 .followRedirects(HttpClient.Redirect.NORMAL)
+                .version(HttpClient.Version.HTTP_1_1)
                 .connectTimeout(Duration.ofSeconds(8))
-                .executor(Executors.newCachedThreadPool())
+                .executor(executorService)
                 .build();
     }
@@ -116,7 +128,7 @@ public class HttpFetcherImpl implements HttpFetcher {
         for (int tries = 0;; tries++) {
             try {
-                var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
+                var rsp = SendLock.wrapSend(client, head, HttpResponse.BodyHandlers.discarding());
                 EdgeUrl rspUri = new EdgeUrl(rsp.uri());

                 if (!Objects.equals(rspUri.domain, url.domain)) {
@@ -153,7 +165,7 @@ public class HttpFetcherImpl implements HttpFetcher {
                 .timeout(requestTimeout)
                 ;

-        var rsp = client.send(headBuilder.build(), HttpResponse.BodyHandlers.discarding());
+        var rsp = SendLock.wrapSend(client, headBuilder.build(), HttpResponse.BodyHandlers.discarding());
         var headers = rsp.headers();

         var contentTypeHeader = headers.firstValue("Content-Type").orElse(null);
@@ -229,21 +241,24 @@ public class HttpFetcherImpl implements HttpFetcher {
         contentTags.paint(getBuilder);

-        HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());
+        try (var sl = new SendLock()) {
+            HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());

-        if (result instanceof HttpFetchResult.ResultOk ok) {
-            if (ok.statusCode() == 429) {
-                throw new RateLimitException(Objects.requireNonNullElse(ok.header("Retry-After"), "1"));
-            }
-            if (ok.statusCode() == 304) {
-                return new HttpFetchResult.Result304Raw();
-            }
-            if (ok.statusCode() == 200) {
-                return ok;
-            }
-        }
+            if (result instanceof HttpFetchResult.ResultOk ok) {
+                if (ok.statusCode() == 429) {
+                    throw new RateLimitException(Objects.requireNonNullElse(ok.header("Retry-After"), "1"));
+                }
+                if (ok.statusCode() == 304) {
+                    return new HttpFetchResult.Result304Raw();
+                }
+                if (ok.statusCode() == 200) {
+                    return ok;
+                }
+            }

-        return result;
+            return result;
+        }
     }

     @Override
@@ -317,22 +332,28 @@ public class HttpFetcherImpl implements HttpFetcher {
                 .timeout(requestTimeout)
                 .build();

-        var response = client.send(getRequest, HttpResponse.BodyHandlers.ofInputStream());
-        if (response.statusCode() != 200) {
-            return new SitemapResult.SitemapError();
-        }
-
-        Document parsedSitemap;
-        try (InputStream inputStream = response.body()) {
-            InputStream parserStream;
-            if (sitemapUrl.path.endsWith(".gz")) {
-                parserStream = new GZIPInputStream(inputStream);
-            }
-            else {
-                parserStream = inputStream;
-            }
-
-            parsedSitemap = Jsoup.parse(parserStream, "UTF-8", sitemapUrl.toString(), Parser.xmlParser());
-        }
+        Document parsedSitemap;
+        try (var sl = new SendLock()) {
+            var response = client.send(getRequest, HttpResponse.BodyHandlers.ofInputStream());
+            if (response.statusCode() != 200) {
+                return new SitemapResult.SitemapError();
+            }
+
+            try (InputStream inputStream = response.body()) {
+                InputStream parserStream;
+                if (sitemapUrl.path.endsWith(".gz")) {
+                    parserStream = new GZIPInputStream(inputStream);
+                } else {
+                    parserStream = inputStream;
+                }
+
+                parsedSitemap = Jsoup.parse(parserStream, "UTF-8", sitemapUrl.toString(), Parser.xmlParser());
+            }
+            finally {
+                sl.close();
+            }
+        }

         if (parsedSitemap.childrenSize() == 0) {
             return new SitemapResult.SitemapError();
         }
@@ -386,7 +407,7 @@ public class HttpFetcherImpl implements HttpFetcher {
     }

     private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
-        try {
+        try (var sl = new SendLock()) {
             var getRequest = HttpRequest.newBuilder()
                     .GET()
                     .uri(url.asURI())
@@ -429,5 +450,30 @@ public class HttpFetcherImpl implements HttpFetcher {
         }
     }
 }
+
+class SendLock implements AutoCloseable {
+
+    private static final Semaphore maxConcurrentRequests = new Semaphore(Integer.getInteger("crawler.maxConcurrentRequests", 512));
+    boolean closed = false;
+
+    public SendLock() {
+        maxConcurrentRequests.acquireUninterruptibly();
+    }
+
+    public static <T> HttpResponse<T> wrapSend(HttpClient client, HttpRequest request, HttpResponse.BodyHandler<T> handler) throws IOException, InterruptedException {
+        try (var lock = new SendLock()) {
+            return client.send(request, handler);
+        }
+    }
+
+    @Override
+    public void close() {
+        if (!closed) {
+            maxConcurrentRequests.release();
+            closed = true;
+        }
+    }
+}

View File

@@ -8,7 +8,10 @@ import java.net.http.HttpHeaders;
 import java.net.http.HttpResponse;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.time.Duration;
+import java.time.Instant;
 import java.util.Map;
+import java.util.concurrent.*;
 import java.util.zip.GZIPInputStream;

 /** Input buffer for temporary storage of a HTTP response
@@ -39,7 +42,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
      * and suppressed from the headers.
      * If an error occurs, a buffer will be created with no content and an error status.
      */
-    static WarcInputBuffer forResponse(HttpResponse<InputStream> rsp) {
+    static WarcInputBuffer forResponse(HttpResponse<InputStream> rsp, Duration timeLimit) {
         if (rsp == null)
             return new ErrorBuffer();
@@ -51,11 +54,11 @@ public abstract class WarcInputBuffer implements AutoCloseable {
             if (contentEncoding == null && contentLength > 0 && contentLength < 8192) {
                 // If the content is small and not compressed, we can just read it into memory
-                return new MemoryBuffer(headers, is, contentLength);
+                return new MemoryBuffer(headers, timeLimit, is, contentLength);
             }
             else {
                 // Otherwise, we unpack it into a file and read it from there
-                return new FileBuffer(headers, is);
+                return new FileBuffer(headers, timeLimit, is);
             }
         }
         catch (Exception ex) {
@@ -64,9 +67,16 @@ public abstract class WarcInputBuffer implements AutoCloseable {
         }
     }

+    private static final ExecutorService virtualExecutorService = Executors.newVirtualThreadPerTaskExecutor();
+
+    private Future<Integer> readAsync(InputStream is, byte[] out) {
+        return virtualExecutorService.submit(() -> is.read(out));
+    }
+
     /** Copy an input stream to an output stream, with a maximum size and time limit */
-    protected void copy(InputStream is, OutputStream os) {
-        long startTime = System.currentTimeMillis();
+    protected void copy(InputStream is, OutputStream os, Duration timeLimit) {
+        Instant start = Instant.now();
+        Instant timeout = start.plus(timeLimit);
         long size = 0;
         byte[] buffer = new byte[8192];
@@ -76,7 +86,15 @@ public abstract class WarcInputBuffer implements AutoCloseable {
         while (true) {
             try {
-                int n = is.read(buffer);
+                Duration remaining = Duration.between(Instant.now(), timeout);
+                if (remaining.isNegative()) {
+                    truncationReason = WarcTruncationReason.TIME;
+                    break;
+                }
+
+                Future<Integer> readAsync = readAsync(is, buffer);
+                int n = readAsync.get(remaining.toMillis(), TimeUnit.MILLISECONDS);
+
                 if (n < 0) break;
                 size += n;
                 os.write(buffer, 0, n);
@@ -85,12 +103,11 @@ public abstract class WarcInputBuffer implements AutoCloseable {
                     truncationReason = WarcTruncationReason.LENGTH;
                     break;
                 }
-            } catch (IOException e) {
-                if (System.currentTimeMillis() - startTime > WarcRecorder.MAX_TIME) {
-                    truncationReason = WarcTruncationReason.TIME;
-                    break;
-                }
+            } catch (IOException|ExecutionException e) {
+                truncationReason = WarcTruncationReason.UNSPECIFIED;
+            } catch (TimeoutException e) {
+                truncationReason = WarcTruncationReason.TIME;
+            } catch (InterruptedException e) {
                 throw new RuntimeException(e);
             }
         }
@@ -123,12 +140,12 @@ class ErrorBuffer extends WarcInputBuffer {

 /** Buffer for when we have the response in memory */
 class MemoryBuffer extends WarcInputBuffer {
     byte[] data;
-    public MemoryBuffer(HttpHeaders headers, InputStream responseStream, int size) {
+    public MemoryBuffer(HttpHeaders headers, Duration timeLimit, InputStream responseStream, int size) {
         super(headers);

         var outputStream = new ByteArrayOutputStream(size);

-        copy(responseStream, outputStream);
+        copy(responseStream, outputStream, timeLimit);

         data = outputStream.toByteArray();
     }
@@ -152,7 +169,7 @@ class MemoryBuffer extends WarcInputBuffer {
 class FileBuffer extends WarcInputBuffer {
     private final Path tempFile;

-    public FileBuffer(HttpHeaders headers, InputStream responseStream) throws IOException {
+    public FileBuffer(HttpHeaders headers, Duration timeLimit, InputStream responseStream) throws IOException {
         super(suppressContentEncoding(headers));

         this.tempFile = Files.createTempFile("rsp", ".html");
@@ -160,7 +177,7 @@ class FileBuffer extends WarcInputBuffer {
         if ("gzip".equalsIgnoreCase(headers.firstValue("Content-Encoding").orElse(""))) {
             try (var out = Files.newOutputStream(tempFile)) {
-                copy(new GZIPInputStream(responseStream), out);
+                copy(new GZIPInputStream(responseStream), out, timeLimit);
             }
             catch (Exception ex) {
                 truncationReason = WarcTruncationReason.UNSPECIFIED;
@@ -168,7 +185,7 @@ class FileBuffer extends WarcInputBuffer {
         }
         else {
             try (var out = Files.newOutputStream(tempFile)) {
-                copy(responseStream, out);
+                copy(responseStream, out, timeLimit);
             }
             catch (Exception ex) {
                 truncationReason = WarcTruncationReason.UNSPECIFIED;

View File

@@ -102,7 +102,7 @@ public class WarcRecorder implements AutoCloseable {
         }

-        try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response);
+        try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request.timeout().orElseGet(() -> Duration.ofMillis(MAX_TIME)));
              InputStream inputStream = inputBuffer.read())
         {
             if (cookies.hasCookies()) {

View File

@@ -3,6 +3,7 @@ package nu.marginalia.crawl.retreival;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;

 import java.time.Duration;
+import java.util.concurrent.ThreadLocalRandom;

 import static java.lang.Math.max;
 import static java.lang.Math.min;
@@ -53,12 +54,13 @@ public class CrawlDelayTimer {
     public void waitFetchDelay(long spentTime) {
         long sleepTime = delayTime;

+        long jitter = ThreadLocalRandom.current().nextLong(0, 150);
         try {
             if (sleepTime >= 1) {
                 if (spentTime > sleepTime)
                     return;

-                Thread.sleep(min(sleepTime - spentTime, 5000));
+                Thread.sleep(min(sleepTime - spentTime, 5000) + jitter);
             } else {
                 // When no crawl delay is specified, lean toward twice the fetch+process time,
                 // within sane limits. This means slower servers get slower crawling, and faster
@@ -71,17 +73,17 @@ public class CrawlDelayTimer {
                 if (spentTime > sleepTime)
                     return;

-                Thread.sleep(sleepTime - spentTime);
+                Thread.sleep(sleepTime - spentTime + jitter);
             }

             if (slowDown) {
                 // Additional delay when the server is signalling it wants slower requests
-                Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS);
+                Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS + jitter);
             }
         }
         catch (InterruptedException e) {
             Thread.currentThread().interrupt();
-            throw new RuntimeException();
+            throw new RuntimeException("Interrupted", e);
         }
     }
 }

View File

@@ -26,6 +26,8 @@ import java.io.IOException;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.nio.file.Path;
+import java.time.Duration;
+import java.time.Instant;
 import java.util.List;
 import java.util.Optional;
 import java.util.concurrent.TimeUnit;
@@ -108,15 +110,24 @@ public class CrawlerRetreiver implements AutoCloseable {
                 DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, delayTimer);
                 domainStateDb.save(summaryRecord);

+                if (Thread.interrupted()) {
+                    // There's a small chance we're interrupted during the sniffing portion
+                    throw new InterruptedException();
+                }
+
+                Instant recrawlStart = Instant.now();
+                CrawlerRevisitor.RecrawlMetadata recrawlMetadata = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
+                Duration recrawlTime = Duration.between(recrawlStart, Instant.now());
+
                 // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
-                if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
+                if (recrawlMetadata.size() > 0) {
                     // If we have reference data, we will always grow the crawl depth a bit
                     crawlFrontier.increaseDepth(1.5, 2500);
                 }

                 oldCrawlData.close(); // proactively close the crawl data reference here to not hold onto expensive resources

-                yield crawlDomain(probedUrl, robotsRules, delayTimer, domainLinks);
+                yield crawlDomain(probedUrl, robotsRules, delayTimer, domainLinks, recrawlMetadata, recrawlTime);
             }
             case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> {
                 domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString()));
@@ -138,17 +149,29 @@ public class CrawlerRetreiver implements AutoCloseable {
     private int crawlDomain(EdgeUrl rootUrl,
                             SimpleRobotRules robotsRules,
                             CrawlDelayTimer delayTimer,
-                            DomainLinks domainLinks) {
+                            DomainLinks domainLinks,
+                            CrawlerRevisitor.RecrawlMetadata recrawlMetadata,
+                            Duration recrawlTime) {

+        Instant crawlStart = Instant.now();
+
         // Add external links to the crawl frontier
         crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));

         // Fetch sitemaps
         for (var sitemap : robotsRules.getSitemaps()) {
-            crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
+            // Validate the sitemap URL and check if it belongs to the domain as the root URL
+            if (EdgeUrl.parse(sitemap)
+                    .map(url -> url.getDomain().equals(rootUrl.domain))
+                    .orElse(false)) {
+                crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
+            }
         }

+        int crawlerAdditions = 0;
+
         while (!crawlFrontier.isEmpty()
                 && !crawlFrontier.isCrawlDepthReached()
                 && errorCount < MAX_ERRORS
@@ -180,7 +203,11 @@ public class CrawlerRetreiver implements AutoCloseable {
                     continue;

                 try {
-                    fetchContentWithReference(top, delayTimer, DocumentWithReference.empty());
+                    var result = fetchContentWithReference(top, delayTimer, DocumentWithReference.empty());
+
+                    if (result.isOk()) {
+                        crawlerAdditions++;
+                    }
                 }
                 catch (InterruptedException ex) {
                     Thread.currentThread().interrupt();
@@ -188,6 +215,17 @@ public class CrawlerRetreiver implements AutoCloseable {
             }
         }

+        Duration crawlTime = Duration.between(crawlStart, Instant.now());
+        domainStateDb.save(new DomainStateDb.CrawlMeta(
+                domain,
+                Instant.now(),
+                recrawlTime,
+                crawlTime,
+                recrawlMetadata.errors(),
+                crawlerAdditions,
+                recrawlMetadata.size() + crawlerAdditions
+        ));
+
         return crawlFrontier.visitedSize();
     }
@@ -289,6 +327,10 @@ public class CrawlerRetreiver implements AutoCloseable {
         }
         catch (Exception ex) {
             logger.error("Error configuring link filter", ex);
+            if (Thread.interrupted()) {
+                Thread.currentThread().interrupt();
+                return DomainStateDb.SummaryRecord.forError(domain, "Crawler Interrupted", ex.getMessage());
+            }
         }
         finally {
             crawlFrontier.addVisited(rootUrl);
@@ -316,7 +358,7 @@ public class CrawlerRetreiver implements AutoCloseable {
     );

     private Optional<String> guessFeedUrl(CrawlDelayTimer timer) throws InterruptedException {
-        var oldDomainStateRecord = domainStateDb.get(domain);
+        var oldDomainStateRecord = domainStateDb.getSummary(domain);

         // If we are already aware of an old feed URL, then we can just revalidate it
         if (oldDomainStateRecord.isPresent()) {

View File

@@ -31,7 +31,7 @@ public class CrawlerRevisitor {
     }

     /** Performs a re-crawl of old documents, comparing etags and last-modified */
-    public int recrawl(CrawlDataReference oldCrawlData,
+    public RecrawlMetadata recrawl(CrawlDataReference oldCrawlData,
                        SimpleRobotRules robotsRules,
                        CrawlDelayTimer delayTimer)
             throws InterruptedException {
@@ -39,6 +39,7 @@ public class CrawlerRevisitor {
         int retained = 0;
         int errors = 0;
         int skipped = 0;
+        int size = 0;

         for (CrawledDocument doc : oldCrawlData) {
             if (errors > 20) {
@@ -46,6 +47,10 @@ public class CrawlerRevisitor {
                 break;
             }

+            if (Thread.interrupted()) {
+                throw new InterruptedException();
+            }
+
             var urlMaybe = EdgeUrl.parse(doc.url);
             if (urlMaybe.isEmpty())
                 continue;
@@ -78,6 +83,7 @@ public class CrawlerRevisitor {
                 continue;
             }

+            size++;

             double skipProb;
@@ -150,6 +156,8 @@ public class CrawlerRevisitor {
             }
         }

-        return recrawled;
+        return new RecrawlMetadata(size, errors, skipped);
     }
+
+    public record RecrawlMetadata(int size, int errors, int skipped) {}
 }

View File

@@ -8,6 +8,7 @@ import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
+import java.time.Duration;
 import java.time.Instant;

 import static org.junit.jupiter.api.Assertions.*;
@@ -47,8 +48,8 @@ class DomainStateDbTest {
             db.save(allFields);
             db.save(minFields);

-            assertEquals(allFields, db.get("all.marginalia.nu").orElseThrow());
-            assertEquals(minFields, db.get("min.marginalia.nu").orElseThrow());
+            assertEquals(allFields, db.getSummary("all.marginalia.nu").orElseThrow());
+            assertEquals(minFields, db.getSummary("min.marginalia.nu").orElseThrow());

             var updatedAllFields = new DomainStateDb.SummaryRecord(
                     "all.marginalia.nu",
@@ -59,7 +60,19 @@ class DomainStateDbTest {
             );

             db.save(updatedAllFields);
-            assertEquals(updatedAllFields, db.get("all.marginalia.nu").orElseThrow());
+            assertEquals(updatedAllFields, db.getSummary("all.marginalia.nu").orElseThrow());
+        }
+    }
+
+    @Test
+    public void testMetadata() throws SQLException {
+        try (var db = new DomainStateDb(tempFile)) {
+            var original = new DomainStateDb.CrawlMeta("example.com", Instant.ofEpochMilli(12345), Duration.ofMillis(30), Duration.ofMillis(300), 1, 2, 3);
+            db.save(original);
+
+            var maybeMeta = db.getMeta("example.com");
+            assertTrue(maybeMeta.isPresent());
+            assertEquals(original, maybeMeta.get());
         }
     }

View File

@@ -0,0 +1,152 @@
package nu.marginalia.crawl.retreival.fetcher;
import com.sun.net.httpserver.HttpServer;
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import org.junit.jupiter.api.*;
import org.netpreserve.jwarc.WarcReader;
import org.netpreserve.jwarc.WarcRequest;
import org.netpreserve.jwarc.WarcResponse;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Duration;
import java.time.Instant;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;
@Tag("slow")
class WarcRecorderFakeServerTest {
static HttpServer server;
@BeforeAll
public static void setUpAll() throws IOException {
server = HttpServer.create(new InetSocketAddress("127.0.0.1", 14510), 10);
// This endpoint will finish sending the response immediately
server.createContext("/fast", exchange -> {
exchange.getResponseHeaders().add("Content-Type", "text/html");
exchange.sendResponseHeaders(200, "<html><body>hello</body></html>".length());
try (var os = exchange.getResponseBody()) {
os.write("<html><body>hello</body></html>".getBytes());
os.flush();
}
exchange.close();
});
// This endpoint will take 10 seconds to finish sending the response,
// which should trigger a timeout in the client
server.createContext("/slow", exchange -> {
exchange.getResponseHeaders().add("Content-Type", "text/html");
exchange.sendResponseHeaders(200, "<html><body>hello</body></html>:D".length());
try (var os = exchange.getResponseBody()) {
os.write("<html><body>hello</body></html>".getBytes());
os.flush();
try {
TimeUnit.SECONDS.sleep(10);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
os.write(":D".getBytes());
os.flush();
}
exchange.close();
});
server.start();
}
@AfterAll
public static void tearDownAll() {
server.stop(0);
}
Path fileNameWarc;
Path fileNameParquet;
WarcRecorder client;
HttpClient httpClient;
@BeforeEach
public void setUp() throws Exception {
httpClient = HttpClient.newBuilder().build();
fileNameWarc = Files.createTempFile("test", ".warc");
fileNameParquet = Files.createTempFile("test", ".parquet");
client = new WarcRecorder(fileNameWarc, new Cookies());
}
@AfterEach
public void tearDown() throws Exception {
client.close();
Files.delete(fileNameWarc);
}
@Test
public void fetchFast() throws Exception {
client.fetch(httpClient,
HttpRequest.newBuilder()
.uri(new java.net.URI("http://localhost:14510/fast"))
.timeout(Duration.ofSeconds(1))
.header("User-agent", "test.marginalia.nu")
.header("Accept-Encoding", "gzip")
.GET().build()
);
Map<String, String> sampleData = new HashMap<>();
try (var warcReader = new WarcReader(fileNameWarc)) {
warcReader.forEach(record -> {
if (record instanceof WarcRequest req) {
sampleData.put(record.type(), req.target());
}
if (record instanceof WarcResponse rsp) {
sampleData.put(record.type(), rsp.target());
}
});
}
System.out.println(sampleData);
}
@Test
public void fetchSlow() throws Exception {
Instant start = Instant.now();
client.fetch(httpClient,
HttpRequest.newBuilder()
.uri(new java.net.URI("http://localhost:14510/slow"))
.timeout(Duration.ofSeconds(1))
.header("User-agent", "test.marginalia.nu")
.header("Accept-Encoding", "gzip")
.GET().build()
);
Instant end = Instant.now();
Map<String, String> sampleData = new HashMap<>();
try (var warcReader = new WarcReader(fileNameWarc)) {
warcReader.forEach(record -> {
if (record instanceof WarcRequest req) {
sampleData.put(record.type(), req.target());
}
if (record instanceof WarcResponse rsp) {
sampleData.put(record.type(), rsp.target());
System.out.println(rsp.target());
}
});
}
System.out.println(sampleData);
// Timeout is set to 1 second, but the /slow endpoint takes 10 seconds to finish responding,
// so we expect the request to take 1s and change before it times out.
Assertions.assertTrue(Duration.between(start, end).toMillis() < 2000);
}
}

View File

@@ -82,17 +82,17 @@ public class SearchService extends JoobyService {
             jooby.get("/site/https://*", this::handleSiteUrlRedirect);
             jooby.get("/site/http://*", this::handleSiteUrlRedirect);

+            String emptySvg = "<svg xmlns=\"http://www.w3.org/2000/svg\"></svg>";
             jooby.get("/site/{domain}/favicon", ctx -> {
                 String domain = ctx.path("domain").value();
                 logger.info("Finding icon for domain {}", domain);
-                domainQueries.getDomainId(new EdgeDomain(domain));
                 try {
                     DbDomainQueries.DomainIdWithNode domainIdWithNode = domainQueries.getDomainIdWithNode(new EdgeDomain(domain));
                     var faviconMaybe = faviconClient.getFavicon(domain, domainIdWithNode.nodeAffinity());

                     if (faviconMaybe.isEmpty()) {
-                        ctx.setResponseCode(404);
-                        return "";
+                        ctx.setResponseType(MediaType.valueOf("image/svg+xml"));
+                        return emptySvg;
                     } else {
                         var favicon = faviconMaybe.get();
@@ -102,7 +102,8 @@ public class SearchService extends JoobyService {
                     }
                 }
                 catch (NoSuchElementException ex) {
-                    ctx.setResponseCode(404);
+                    ctx.setResponseType(MediaType.valueOf("image/svg+xml"));
+                    return emptySvg;
                 }
                 return "";
             });

View File

@@ -10,7 +10,7 @@

 @template.part.head(title = "Marginalia Search - Explore")

-<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans ">
+<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans ">

 @template.part.navbar(navbar = navbar)
@@ -23,7 +23,7 @@
         </header>

         <div class="max-w-[1400px] mx-auto flex flex-col gap-1 place-items-center">
-            <div class="border dark:border-gray-600 bg-white dark:bg-gray-800 dark:text-gray-100 my-4 p-3 rounded overflow-hidden flex flex-col space-y-4">
+            <div class="border border-gray-300 dark:border-gray-600 bg-white dark:bg-gray-800 dark:text-gray-100 my-4 p-3 rounded overflow-hidden flex flex-col space-y-4">
                 @if (results.hasFocusDomain())
                     <div class="flex space-x-1">
                         <span>Showing websites similar to <a class="font-mono text-liteblue dark:text-blue-200" href="/site/${results.focusDomain()}"><i class="fas fa-globe"></i> <span class="underline">${results.focusDomain()}</span></a></span>
@@ -36,7 +36,7 @@
                 </div>

         <div class="grid-cols-1 gap-4 sm:grid sm:grid-cols-1 md:grid-cols-3 xl:grid-cols-4 mx-auto sm:p-4">
             @for (BrowseResult result : results.results())
-                <div class="bg-white border dark:border-gray-600 dark:bg-gray-800 rounded overflow-hidden">
+                <div class="bg-white border border-gray-300 dark:border-gray-600 dark:bg-gray-800 rounded overflow-hidden">
                     <div class="bg-margeblue text-white p-2 flex space-x-4 text-sm">
                         <span class="break-words">${result.displayDomain()}</span>
                         <div class="grow"></div>

View File

@@ -9,6 +9,7 @@
                 nicotine: '#f8f8ee',
                 margeblue: '#3e5f6f',
                 liteblue: '#0066cc',
+                bgblue: '#e5e9eb',
             },
             screens: {
                 'coarsepointer': {

View File

@@ -15,7 +15,7 @@

 @template.part.head(title = "Marginalia Search - " + parameters.query())

-<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
+<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >

 @template.part.navbar(navbar = navbar)

 <div>

View File

@@ -9,7 +9,7 @@

 @template.part.head(title = "Marginalia Search - Error")

-<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
+<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >

 @template.part.navbar(navbar = navbar)

View File

@@ -11,7 +11,7 @@

 @template.part.head(title = "Marginalia Search - " + results.getQuery())

-<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
+<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >

 @template.part.navbar(navbar = navbar)

 <div>

View File

@@ -26,15 +26,15 @@
     It operates a bit like a clock, starting at the top and working its way around clockwise.</p>

 <div class="flex gap-4 place-items-middle">
-    @template.serp.part.matchogram(mask = 90)
+    @template.serp.part.matchogram(mask = 90, domain = "example.com")
     <div>This is by the beginning</div>
 </div>

 <div class="flex gap-4 place-items-middle">
-    @template.serp.part.matchogram(mask = 90L<<26)
+    @template.serp.part.matchogram(mask = 90L<<26, domain = "example.com")
     <div>This is in the middle</div>
 </div>

 <div class="flex gap-4 place-items-middle">
-    @template.serp.part.matchogram(mask = 5L<<48)
+    @template.serp.part.matchogram(mask = 5L<<48, domain = "example.com")
     <div>This is toward the end</div>
 </div>

View File

@@ -1,11 +1,13 @@
 @import java.util.stream.IntStream

 @param long mask
+@param String domain

-<svg width="40" height="40">
+<svg width="40" height="40"
+     style="background-image: url('/site/${domain}/favicon'); background-repeat: no-repeat; background-size: 16px 16px; background-position: center; ">
     <circle
-            cx="18"
-            cy="18"
+            cx="20"
+            cy="20"
             r="16"
             fill="none"
             stroke="#eee"
@@ -13,10 +15,10 @@
     />
     @for (int bit : IntStream.range(0, 56).filter(bit -> (mask & (1L << bit)) != 0).toArray())
         <line
-                x1="${18 + 15*Math.sin(2 * Math.PI * bit / 56.)}"
-                y1="${18 - 15*Math.cos(2 * Math.PI * bit / 56.)}"
-                x2="${18 + 17*Math.sin(2 * Math.PI * bit / 56.)}"
-                y2="${18 - 17*Math.cos(2 * Math.PI * bit / 56.)}"
+                x1="${20 + 15*Math.sin(2 * Math.PI * bit / 56.)}"
+                y1="${20 - 15*Math.cos(2 * Math.PI * bit / 56.)}"
+                x2="${20 + 17*Math.sin(2 * Math.PI * bit / 56.)}"
+                y2="${20 - 17*Math.cos(2 * Math.PI * bit / 56.)}"
                 stroke="#444"
                 stroke-width="2"
         />

View File

@@ -12,7 +12,7 @@
     <div class="flex flex-col grow" >
         <div class="flex flex-row space-x-2 place-items-center">
             <div class="flex-0" title="Match density">
-                @template.serp.part.matchogram(mask = result.first.positionsMask)
+                @template.serp.part.matchogram(mask = result.first.positionsMask, domain=result.getFirst().url.domain.toString())
             </div>
             <div class="flex grow justify-between items-start">
                 <div class="flex-1">

View File

@@ -13,7 +13,7 @@
     @for (List<SearchFilters.Filter> filterGroup : filters.getFilterGroups())
         @for (SearchFilters.Filter filter : filterGroup)
             <label class="flex items-center">
-                <button title="${filter.displayName}" onclick="document.location='$unsafe{filter.url}'" class="flex-1 py-2 pl-2 rounded flex space-x-2 dark:has-[:checked]:bg-gray-950 has-[:checked]:bg-gray-100 has-[:checked]:text-slate-900 dark:has-[:checked]:text-slate-100 hover:bg-gray-50 dark:hover:bg-gray-950 bg-white dark:bg-gray-900 dark:border dark:border-gray-600 text-margeblue dark:text-slate-200 outline-1 active:outline">
+                <button title="${filter.displayName}" onclick="document.location='$unsafe{filter.url}'" class="flex-1 py-2 pl-2 rounded flex space-x-2 dark:has-[:checked]:bg-gray-950 has-[:checked]:bg-gray-300 has-[:checked]:text-slate-900 dark:has-[:checked]:text-slate-100 hover:bg-gray-50 dark:hover:bg-gray-950 bg-white dark:bg-gray-900 dark:border dark:border-gray-600 text-margeblue dark:text-slate-200 outline-1 active:outline">
                     @if (filter.current)
                         <input type="checkbox" checked class="sr-only" aria-checked="true" />
                     @else
@@ -38,7 +38,7 @@
             <div class="space-y-2">
                 @for (SearchFilters.SearchOption option : filters.searchOptions())
                     <label class="flex items-center">
-                        <button title="${option.name()}" onclick="document.location='$unsafe{option.getUrl()}'" class="flex-1 py-2 pl-2 rounded flex space-x-2 dark:has-[:checked]:bg-gray-950 has-[:checked]:bg-gray-100 has-[:checked]:text-slate-900 dark:has-[:checked]:text-slate-100 hover:bg-gray-50 dark:hover:bg-gray-950 bg-white dark:bg-gray-900 dark:border dark:border-gray-600 text-margeblue dark:text-slate-200 outline-1 active:outline">
+                        <button title="${option.name()}" onclick="document.location='$unsafe{option.getUrl()}'" class="flex-1 py-2 pl-2 rounded flex space-x-2 dark:has-[:checked]:bg-gray-950 has-[:checked]:bg-gray-300 has-[:checked]:text-slate-900 dark:has-[:checked]:text-slate-100 hover:bg-gray-50 dark:hover:bg-gray-950 bg-white dark:bg-gray-900 dark:border dark:border-gray-600 text-margeblue dark:text-slate-200 outline-1 active:outline">
                     @if (option.isSet())
                         <input type="checkbox" checked class="sr-only" aria-checked="true" />
                     @else

View File

@@ -15,7 +15,7 @@
 @template.part.head(title = "Marginalia Search", allowIndexing = true)
-<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
+<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
 @template.part.navbar(navbar = navbar)
@@ -32,18 +32,14 @@
 @if (model.news().isEmpty())
 <div class="max-w-7xl mx-auto flex flex-col space-y-4 fill-w">
-<div class="border dark:border-gray-600 dark:bg-gray-800 bg-white rounded p-2 m-4 ">
+<div class="border border-gray-300 border-gray-100 dark:border-gray-600 dark:bg-gray-800 bg-white rounded p-2 m-4 ">
 <div class="text-slate-700 dark:text-white text-sm p-4">
-<div class="fas fa-gift mr-1 text-margeblue dark:text-slate-200"></div>
-This is the new design and home of Marginalia Search.
-You can read about what this entails <a href="https://about.marginalia-search.com/article/redesign/" class="underline text-liteblue dark:text-blue-200">here</a>.
-<p class="my-4"></p>
-The old version of Marginalia Search remains available at
-<a href="https://old-search.marginalia.nu/" class="underline text-liteblue dark:text-blue-200">https://old-search.marginalia.nu/</a>.
+The old version of Marginalia Search remains available
+<a href="https://old-search.marginalia.nu/" class="underline text-liteblue dark:text-blue-200">here</a>.
 </div>
 </div>
 <div class="mx-auto flex flex-col sm:flex-row my-4 sm:space-x-2 space-y-2 sm:space-y-0 w-full md:w-auto px-2">
-<div class="flex flex-col border dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3">
+<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3">
 <div><i class="fas fa-sailboat mx-2 text-margeblue dark:text-slate-200"></i>Explore the Web</div>
 <ul class="list-disc ml-6 text-slate-700 dark:text-white text-xs leading-5">
 <li>Prioritizes non-commercial content</li>
@@ -52,7 +48,7 @@
 </ul>
 </div>
-<div class="flex flex-col border dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3 ">
+<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3 ">
 <div><i class="fas fa-hand-holding-hand mx-2 text-margeblue dark:text-slate-200"></i>Open Source</div>
 <ul class="list-disc ml-6 text-slate-700 dark:text-white text-xs leading-5">
 <li>Custom index and crawler software</li>
@@ -65,7 +61,7 @@
 </div>
 </div>
-<div class="flex flex-col border dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3 ">
+<div class="flex flex-col border border-gray-300 dark:border-gray-600 rounded overflow-hidden dark:bg-gray-800 bg-white p-6 space-y-3 ">
 <div><i class="fas fa-lock mx-2 text-margeblue dark:text-slate-200"></i> Privacy by default</div>
 <ul class="list-disc ml-6 text-slate-700 dark:text-white text-xs leading-5">
 <li>Filter out tracking and adtech</li>

View File

@@ -13,7 +13,7 @@
 @template.part.head(title = "Marginalia Search - " + parameters.query())
-<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
+<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
 @template.part.navbar(navbar = navbar)
 <div>

View File

@@ -11,7 +11,7 @@
 @template.part.head(title = "Marginalia Search - " + model.domainA() + "/" + model.domainB())
-<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
+<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
 @template.part.navbar(navbar = navbar)

View File

@@ -9,7 +9,7 @@
 @template.part.head(title = "Marginalia Search - " + model.domain())
-<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
+<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
 @template.part.navbar(navbar = navbar)

View File

@@ -7,7 +7,7 @@
 @if (!list.isEmpty())
-<div class="bg-white dark:bg-gray-800 shadow-sm rounded overflow-hidden border dark:border-gray-600">
+<div class="bg-white dark:bg-gray-800 shadow-sm rounded overflow-hidden border border-gray-300 dark:border-gray-600">
 <div class="px-4 py-2 bg-margeblue text-white border-b border-gray-200 dark:border-gray-600 flex place-items-baseline">
 <h2 class="text-md">${title}</h2>
 <div class="grow"></div>

View File

@@ -9,11 +9,11 @@
 @template.part.head(title = "Marginalia Search - Site Viewer")
-<body class="min-h-screen bg-slate-100 dark:bg-gray-900 dark:text-white font-sans " >
+<body class="min-h-screen bg-bgblue dark:bg-gray-900 dark:text-white font-sans " >
 @template.part.navbar(navbar = navbar)
-<header class="border-b border-gray-300 dark:border-gray-600 bg-white dark:bg-gray-800 shadow-md">
+<header class="border-b border-gray-300 border-gray-300 dark:border-gray-600 bg-white dark:bg-gray-800 shadow-md">
 <div class="max-w-[1400px] mx-auto px-4 py-4">
 <h1 class="text-base md:text-xl mr-2 md:mr-8 font-serif">View Site Information</h1>
 </div>
@@ -22,7 +22,7 @@
 <div class="max-w-[1000px] mx-auto flex gap-4 flex-col md:flex-row place-items-center md:place-items-start p-4">
-<div class="border dark:border-gray-600 rounded md:my-4 overflow-hidden bg-white dark:bg-gray-800 flex flex-col space-y-2 flex-1">
+<div class="border border-gray-300 dark:border-gray-600 rounded md:my-4 overflow-hidden bg-white dark:bg-gray-800 flex flex-col space-y-2 flex-1">
 <div class="bg-margeblue text-white p-2 text-sm mb-2">View Site Information</div>
 <p class="mx-4">This utility lets you explore what the search engine knows about the web,
@@ -45,7 +45,7 @@
 </div>
 @if (!model.domains().isEmpty())
-<div class="border dark:border-gray-600 rounded md:my-4 overflow-hidden w-full md:w-auto">
+<div class="border border-gray-300 dark:border-gray-600 rounded md:my-4 overflow-hidden w-full md:w-auto">
 <div class="bg-margeblue text-white p-2 text-sm">Recently Discovered Domains</div>

View File

@@ -8,17 +8,17 @@
 <div class="flex flex-col space-y-4 my-4 w-full">
 @if (backlinks.results().isEmpty())
-<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm ">
+<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm ">
 The search engine isn't aware of any backlinks to ${backlinks.domain()}!
 </div>
 @else
-<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
+<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
 Showing documents linking to ${backlinks.domain()}
 </div>
 @endif
 @for (GroupedUrlDetails group : backlinks.results())
-<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden mx-4">
+<div class="border dark:border-gray-600 border-gray-300 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden mx-4">
 <div class="flex space-x-2 flex-row place-items-baseline bg-margeblue text-white p-2 text-md">
 <span class="fas fa-globe"></span>
 <a href="/site/${group.domain().toString()}">${group.domain().toString()}</a>

View File

@@ -9,17 +9,17 @@
 <div class="flex flex-col space-y-4 my-4">
 @if (docs.results().isEmpty())
-<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
+<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
 The search engine doesn't index any documents from ${docs.domain()}
 </div>
 @else
-<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
+<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden p-4 mx-4 text-gray-800 text-sm">
 Showing documents from ${docs.domain()}
 </div>
 @endif
 @for (UrlDetails details : docs.results())
-<div class="border dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden mx-4">
+<div class="border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 dark:text-white flex flex-col overflow-hidden mx-4">
 <div class="flex grow justify-between items-start p-4">
 <div class="flex-1">
 <h2 class="text-xl text-gray-800 dark:text-white font-serif mr-4">

View File

@@ -8,7 +8,7 @@
 <!-- Main content -->
 <div class="flex-1 p-4 space-y-4 mx-auto w-full md:w-auto">
-<div class="flex border dark:border-gray-600 rounded bg-white dark:bg-gray-800 flex-col space-y-4 pb-4 overflow-hidden md:max-w-lg" >
+<div class="flex border border-gray-300 dark:border-gray-600 rounded bg-white dark:bg-gray-800 flex-col space-y-4 pb-4 overflow-hidden md:max-w-lg" >
 <div class="flex place-items-center space-x-2 p-2 text-md border-b dark:border-gray-600 bg-margeblue text-white">
 <img src="/site/${siteInfo.domain()}/favicon" style="width: 16px; height: 16px; vertical-align: center">
 <span>${siteInfo.domain()}</span>

View File

@@ -4,7 +4,7 @@
 @param ReportDomain reportDomain
 <div class="flex-col mx-auto">
-<div class="max-w-2xl mx-auto bg-white dark:bg-gray-800 border dark:border-gray-600 rounded overflow-auto shadow-sm my-4 space-y-4 w-full">
+<div class="max-w-2xl mx-auto bg-white dark:bg-gray-800 border border-gray-300 dark:border-gray-600 rounded overflow-auto shadow-sm my-4 space-y-4 w-full">
 <div class="px-4 py-2 bg-margeblue text-white border-b border-gray-200 dark:border-gray-800">
 <h2 class="text-md">Report Domain Issue</h2>
 </div>

View File

@@ -9,6 +9,7 @@ module.exports = {
 nicotine: '#f8f8ee',
 margeblue: '#3e5f6f',
 liteblue: '#0066cc',
+bgblue: '#e5e9eb',
 },
 screens: {
 'coarsepointer': {