mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 17:32:39 +02:00

Compare commits


3 Commits

Author SHA1 Message Date
Viktor Lofgren
ecb0e57a1a (crawler) Make the use of virtual threads in the crawler configurable via system properties 2025-03-27 21:26:05 +01:00
Viktor Lofgren
8c61f61b46 (crawler) Add crawling metadata to domainstate db 2025-03-27 16:38:37 +01:00
Viktor Lofgren
662a18c933 Revert "(crawler) Further rearrange crawl order"
This reverts commit 1c2426a052.

The change does not appear necessary to avoid problems.
2025-03-27 11:25:08 +01:00
6 changed files with 151 additions and 16 deletions
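The headline change in ecb0e57a1a is that virtual threads are now opt-in rather than hard-coded: both the crawler worker pool and the HTTP client pick their thread model from JVM system properties. A minimal sketch of how those flags behave, assuming the property names from the diffs below (the class name and launch line are purely illustrative):

    // Launched e.g. as: java -Dcrawler.useVirtualThreads=true -Dcrawler.httpclient.useVirtualThreads=true ...
    class ThreadFlagSketch {
        public static void main(String[] args) {
            // Boolean.getBoolean is true only for the literal property value "true"; with the
            // flags absent, both components fall back to platform threads after this change.
            boolean poolVirtual   = Boolean.getBoolean("crawler.useVirtualThreads");            // crawler worker pool
            boolean clientVirtual = Boolean.getBoolean("crawler.httpclient.useVirtualThreads"); // HttpClient executor
            int poolSize = Integer.getInteger("crawler.poolSize", 256);                         // pre-existing size property
            System.out.println("pool virtual=" + poolVirtual + ", http virtual=" + clientVirtual + ", poolSize=" + poolSize);
        }
    }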

CrawlerMain.java

@@ -103,10 +103,18 @@ public class CrawlerMain extends ProcessMainClass {
         this.blacklist = blacklist;
         this.node = processConfiguration.node();
 
+        SimpleBlockingThreadPool.ThreadType threadType;
+        if (Boolean.getBoolean("crawler.useVirtualThreads")) {
+            threadType = SimpleBlockingThreadPool.ThreadType.VIRTUAL;
+        }
+        else {
+            threadType = SimpleBlockingThreadPool.ThreadType.PLATFORM;
+        }
+
         pool = new SimpleBlockingThreadPool("CrawlerPool",
                 Integer.getInteger("crawler.poolSize", 256),
                 1,
-                SimpleBlockingThreadPool.ThreadType.VIRTUAL);
+                threadType);
 
         // Wait for the blacklist to be loaded before starting the crawl
@@ -302,8 +310,8 @@ public class CrawlerMain extends ProcessMainClass {
     }
 
     /** Create a comparator that sorts the crawl specs in a way that is beneficial for the crawl,
-     * we want to enqueue domains that tend ro be large and have common top domains first,
-     * but otherwise have a random order.
+     * we want to enqueue domains that have common top domains first, but otherwise have a random
+     * order.
      * <p></p>
      * Note, we can't use hash codes for randomization as it is not desirable to have the same order
      * every time the process is restarted (and CrawlSpecRecord is a record, which defines equals and
@@ -311,13 +319,15 @@ public class CrawlerMain extends ProcessMainClass {
      * */
     private Comparator<CrawlSpecRecord> crawlSpecArrangement(List<CrawlSpecRecord> records) {
         Random r = new Random();
 
+        Map<String, Integer> topDomainCounts = new HashMap<>(4 + (int) Math.sqrt(records.size()));
         Map<String, Integer> randomOrder = new HashMap<>(records.size());
 
         for (var spec : records) {
+            topDomainCounts.merge(EdgeDomain.getTopDomain(spec.domain), 1, Integer::sum);
             randomOrder.put(spec.domain, r.nextInt());
         }
 
-        return Comparator.comparing((CrawlSpecRecord spec) -> spec.domain.contains(".edu"))
+        return Comparator.comparing((CrawlSpecRecord spec) -> topDomainCounts.getOrDefault(EdgeDomain.getTopDomain(spec.domain), 0) >= 8)
                 .reversed()
                 .thenComparing(spec -> randomOrder.get(spec.domain))
                 .thenComparing(Record::hashCode); // non-deterministic tie-breaker to
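The comparator change replaces the old ".edu first" heuristic with a frequency-based one: domains whose top domain occurs at least 8 times in the batch are enqueued first, everything else follows in per-run random order. A small self-contained sketch of that ordering, using a simplified stand-in for CrawlSpecRecord (the Spec record, the sample domains, and the driver are illustrative; the comparator mirrors the diff above):

    import java.util.*;
    
    class CrawlOrderSketch {
        record Spec(String domain, String topDomain) {}   // hypothetical stand-in for CrawlSpecRecord
    
        public static void main(String[] args) {
            List<Spec> specs = new ArrayList<>();
            for (int i = 0; i < 8; i++) specs.add(new Spec("user" + i + ".blogspot.com", "blogspot.com"));
            specs.add(new Spec("example.org", "example.org"));
            specs.add(new Spec("marginalia.nu", "marginalia.nu"));
    
            Random r = new Random();
            Map<String, Integer> topDomainCounts = new HashMap<>();
            Map<String, Integer> randomOrder = new HashMap<>();
            for (Spec s : specs) {
                topDomainCounts.merge(s.topDomain(), 1, Integer::sum);
                randomOrder.put(s.domain(), r.nextInt());
            }
    
            // comparing() on a boolean key sorts false before true; reversed() flips that,
            // so the blogspot.com group (count >= 8) comes first, the rest in random order.
            specs.sort(Comparator.comparing((Spec s) -> topDomainCounts.getOrDefault(s.topDomain(), 0) >= 8)
                    .reversed()
                    .thenComparing(s -> randomOrder.get(s.domain())));
    
            specs.forEach(s -> System.out.println(s.domain()));
        }
    }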

DomainStateDb.java

@@ -11,6 +11,7 @@ import java.nio.file.Path;
 import java.sql.Connection;
 import java.sql.DriverManager;
 import java.sql.SQLException;
+import java.time.Duration;
 import java.time.Instant;
 import java.util.Objects;
 import java.util.Optional;
@@ -24,6 +25,17 @@ public class DomainStateDb implements AutoCloseable {
     private final Connection connection;
 
+    public record CrawlMeta(
+            String domainName,
+            Instant lastFullCrawl,
+            Duration recrawlTime,
+            Duration crawlTime,
+            int recrawlErrors,
+            int crawlChanges,
+            int totalCrawlSize
+    ) {}
+
     public record SummaryRecord(
             String domainName,
             Instant lastUpdated,
@@ -102,6 +114,17 @@ public class DomainStateDb implements AutoCloseable {
feedUrl TEXT feedUrl TEXT
) )
"""); """);
stmt.executeUpdate("""
CREATE TABLE IF NOT EXISTS crawl_meta (
domain TEXT PRIMARY KEY,
lastFullCrawlEpochMs LONG NOT NULL,
recrawlTimeMs LONG NOT NULL,
recrawlErrors INTEGER NOT NULL,
crawlTimeMs LONG NOT NULL,
crawlChanges INTEGER NOT NULL,
totalCrawlSize INTEGER NOT NULL
)
""");
stmt.executeUpdate(""" stmt.executeUpdate("""
CREATE TABLE IF NOT EXISTS favicon ( CREATE TABLE IF NOT EXISTS favicon (
domain TEXT PRIMARY KEY, domain TEXT PRIMARY KEY,
@@ -164,6 +187,26 @@ public class DomainStateDb implements AutoCloseable {
         return Optional.empty();
     }
 
+    public void save(CrawlMeta crawlMeta) {
+        if (connection == null) throw new IllegalStateException("No connection to domainstate db");
+
+        try (var stmt = connection.prepareStatement("""
+                INSERT OR REPLACE INTO crawl_meta (domain, lastFullCrawlEpochMs, recrawlTimeMs, recrawlErrors, crawlTimeMs, crawlChanges, totalCrawlSize)
+                VALUES (?, ?, ?, ?, ?, ?, ?)
+                """)) {
+            stmt.setString(1, crawlMeta.domainName());
+            stmt.setLong(2, crawlMeta.lastFullCrawl.toEpochMilli());
+            stmt.setLong(3, crawlMeta.recrawlTime.toMillis());
+            stmt.setInt(4, crawlMeta.recrawlErrors);
+            stmt.setLong(5, crawlMeta.crawlTime.toMillis());
+            stmt.setInt(6, crawlMeta.crawlChanges);
+            stmt.setInt(7, crawlMeta.totalCrawlSize);
+            stmt.executeUpdate();
+        } catch (SQLException e) {
+            logger.error("Failed to insert crawl meta record", e);
+        }
+    }
+
     public void save(SummaryRecord record) {
         if (connection == null) throw new IllegalStateException("No connection to domainstate db");
@@ -182,7 +225,35 @@ public class DomainStateDb implements AutoCloseable {
         }
     }
 
-    public Optional<SummaryRecord> get(String domainName) {
+    public Optional<CrawlMeta> getMeta(String domainName) {
+        if (connection == null)
+            return Optional.empty();
+
+        try (var stmt = connection.prepareStatement("""
+                SELECT domain, lastFullCrawlEpochMs, recrawlTimeMs, recrawlErrors, crawlTimeMs, crawlChanges, totalCrawlSize
+                FROM crawl_meta
+                WHERE domain = ?
+                """)) {
+            stmt.setString(1, domainName);
+            var rs = stmt.executeQuery();
+            if (rs.next()) {
+                return Optional.of(new CrawlMeta(
+                        rs.getString("domain"),
+                        Instant.ofEpochMilli(rs.getLong("lastFullCrawlEpochMs")),
+                        Duration.ofMillis(rs.getLong("recrawlTimeMs")),
+                        Duration.ofMillis(rs.getLong("crawlTimeMs")),
+                        rs.getInt("recrawlErrors"),
+                        rs.getInt("crawlChanges"),
+                        rs.getInt("totalCrawlSize")
+                ));
+            }
+        } catch (SQLException ex) {
+            logger.error("Failed to get crawl meta record", ex);
+        }
+
+        return Optional.empty();
+    }
+
+    public Optional<SummaryRecord> getSummary(String domainName) {
         if (connection == null)
             return Optional.empty();
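Since domain is the primary key and the write path uses INSERT OR REPLACE, saving a second CrawlMeta for the same domain simply overwrites the earlier row. A hedged usage sketch of the new API (the surrounding method and all values are illustrative; it assumes a DomainStateDb opened on some path, as in the test further down):

    void recordCrawlStats(DomainStateDb db) {
        var first  = new DomainStateDb.CrawlMeta("example.com", Instant.now(),
                Duration.ofMinutes(5), Duration.ofMinutes(9), 0, 10, 50);
        var second = new DomainStateDb.CrawlMeta("example.com", Instant.now(),
                Duration.ofMinutes(4), Duration.ofMinutes(8), 1, 3, 53);

        db.save(first);
        db.save(second);                                           // same key: replaces the earlier row
        db.getMeta("example.com").ifPresent(System.out::println);  // prints the fields of `second`
    }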

HttpFetcherImpl.java

@@ -29,6 +29,7 @@ import java.net.http.HttpResponse;
 import java.net.http.HttpTimeoutException;
 import java.time.Duration;
 import java.util.*;
+import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Semaphore;
 import java.util.zip.GZIPInputStream;
@@ -56,12 +57,21 @@ public class HttpFetcherImpl implements HttpFetcher {
     private final HttpClient client;
 
     private HttpClient createClient() {
+        final ExecutorService executorService;
+
+        if (Boolean.getBoolean("crawler.httpclient.useVirtualThreads")) {
+            executorService = Executors.newVirtualThreadPerTaskExecutor();
+        }
+        else {
+            executorService = Executors.newCachedThreadPool();
+        }
+
         return HttpClient.newBuilder()
                 .sslContext(NoSecuritySSL.buildSslContext())
                 .cookieHandler(cookies)
                 .followRedirects(HttpClient.Redirect.NORMAL)
                 .connectTimeout(Duration.ofSeconds(8))
-                .executor(Executors.newVirtualThreadPerTaskExecutor())
+                .executor(executorService)
                 .build();
     }

CrawlerRetreiver.java

@@ -26,6 +26,8 @@ import java.io.IOException;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.nio.file.Path;
+import java.time.Duration;
+import java.time.Instant;
 import java.util.List;
 import java.util.Optional;
 import java.util.concurrent.TimeUnit;
@@ -113,15 +115,19 @@ public class CrawlerRetreiver implements AutoCloseable {
                     throw new InterruptedException();
                 }
 
+                Instant recrawlStart = Instant.now();
+                CrawlerRevisitor.RecrawlMetadata recrawlMetadata = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
+                Duration recrawlTime = Duration.between(recrawlStart, Instant.now());
+
                 // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
-                if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
+                if (recrawlMetadata.size() > 0) {
                     // If we have reference data, we will always grow the crawl depth a bit
                     crawlFrontier.increaseDepth(1.5, 2500);
                 }
 
                 oldCrawlData.close(); // proactively close the crawl data reference here to not hold onto expensive resources
 
-                yield crawlDomain(probedUrl, robotsRules, delayTimer, domainLinks);
+                yield crawlDomain(probedUrl, robotsRules, delayTimer, domainLinks, recrawlMetadata, recrawlTime);
             }
             case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> {
                 domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString()));
@@ -143,7 +149,11 @@ public class CrawlerRetreiver implements AutoCloseable {
     private int crawlDomain(EdgeUrl rootUrl,
                             SimpleRobotRules robotsRules,
                             CrawlDelayTimer delayTimer,
-                            DomainLinks domainLinks) {
+                            DomainLinks domainLinks,
+                            CrawlerRevisitor.RecrawlMetadata recrawlMetadata,
+                            Duration recrawlTime) {
+        Instant crawlStart = Instant.now();
 
         // Add external links to the crawl frontier
         crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));
@@ -153,6 +163,8 @@ public class CrawlerRetreiver implements AutoCloseable {
             crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
         }
 
+        int crawlerAdditions = 0;
+
         while (!crawlFrontier.isEmpty()
                 && !crawlFrontier.isCrawlDepthReached()
                 && errorCount < MAX_ERRORS
@@ -184,7 +196,11 @@ public class CrawlerRetreiver implements AutoCloseable {
                 continue;
 
             try {
-                fetchContentWithReference(top, delayTimer, DocumentWithReference.empty());
+                var result = fetchContentWithReference(top, delayTimer, DocumentWithReference.empty());
+
+                if (result.isOk()) {
+                    crawlerAdditions++;
+                }
             }
             catch (InterruptedException ex) {
                 Thread.currentThread().interrupt();
@@ -192,6 +208,17 @@ public class CrawlerRetreiver implements AutoCloseable {
             }
         }
 
+        Duration crawlTime = Duration.between(crawlStart, Instant.now());
+        domainStateDb.save(new DomainStateDb.CrawlMeta(
+                domain,
+                Instant.now(),
+                recrawlTime,
+                crawlTime,
+                recrawlMetadata.errors(),
+                crawlerAdditions,
+                recrawlMetadata.size() + crawlerAdditions
+        ));
+
         return crawlFrontier.visitedSize();
     }
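To make the bookkeeping concrete: if the revisit pass replayed 40 old documents with 2 errors and the live crawl loop then fetched 10 new documents, the record saved above carries crawlChanges = 10 and totalCrawlSize = 40 + 10 = 50. A sketch with illustrative values only (domain and durations are made up):

    var meta = new DomainStateDb.CrawlMeta(
            "www.example.com",         // domain
            Instant.now(),             // lastFullCrawl
            Duration.ofMinutes(3),     // recrawlTime: time spent replaying old crawl data
            Duration.ofMinutes(12),    // crawlTime: time spent in the live crawl loop
            2,                         // recrawlErrors, from recrawlMetadata.errors()
            10,                        // crawlChanges = crawlerAdditions
            50                         // totalCrawlSize = recrawlMetadata.size() + crawlerAdditions
    );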
@@ -324,7 +351,7 @@ public class CrawlerRetreiver implements AutoCloseable {
     );
 
     private Optional<String> guessFeedUrl(CrawlDelayTimer timer) throws InterruptedException {
-        var oldDomainStateRecord = domainStateDb.get(domain);
+        var oldDomainStateRecord = domainStateDb.getSummary(domain);
 
         // If we are already aware of an old feed URL, then we can just revalidate it
         if (oldDomainStateRecord.isPresent()) {

CrawlerRevisitor.java

@@ -31,7 +31,7 @@ public class CrawlerRevisitor {
     }
 
     /** Performs a re-crawl of old documents, comparing etags and last-modified */
-    public int recrawl(CrawlDataReference oldCrawlData,
+    public RecrawlMetadata recrawl(CrawlDataReference oldCrawlData,
                        SimpleRobotRules robotsRules,
                        CrawlDelayTimer delayTimer)
             throws InterruptedException {
@@ -39,6 +39,7 @@ public class CrawlerRevisitor {
         int retained = 0;
         int errors = 0;
         int skipped = 0;
+        int size = 0;
 
         for (CrawledDocument doc : oldCrawlData) {
             if (errors > 20) {
@@ -82,6 +83,7 @@ public class CrawlerRevisitor {
                 continue;
             }
 
+            size++;
 
             double skipProb;
@@ -154,6 +156,8 @@ public class CrawlerRevisitor {
             }
         }
 
-        return recrawled;
+        return new RecrawlMetadata(size, errors, skipped);
     }
+
+    public record RecrawlMetadata(int size, int errors, int skipped) {}
 }

DomainStateDbTest.java

@@ -8,6 +8,7 @@ import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
+import java.time.Duration;
 import java.time.Instant;
 
 import static org.junit.jupiter.api.Assertions.*;
@@ -47,8 +48,8 @@ class DomainStateDbTest {
             db.save(allFields);
             db.save(minFields);
 
-            assertEquals(allFields, db.get("all.marginalia.nu").orElseThrow());
-            assertEquals(minFields, db.get("min.marginalia.nu").orElseThrow());
+            assertEquals(allFields, db.getSummary("all.marginalia.nu").orElseThrow());
+            assertEquals(minFields, db.getSummary("min.marginalia.nu").orElseThrow());
 
             var updatedAllFields = new DomainStateDb.SummaryRecord(
                     "all.marginalia.nu",
@@ -59,7 +60,19 @@ class DomainStateDbTest {
             );
 
             db.save(updatedAllFields);
-            assertEquals(updatedAllFields, db.get("all.marginalia.nu").orElseThrow());
+            assertEquals(updatedAllFields, db.getSummary("all.marginalia.nu").orElseThrow());
         }
     }
 
+    @Test
+    public void testMetadata() throws SQLException {
+        try (var db = new DomainStateDb(tempFile)) {
+            var original = new DomainStateDb.CrawlMeta("example.com", Instant.ofEpochMilli(12345), Duration.ofMillis(30), Duration.ofMillis(300), 1, 2, 3);
+            db.save(original);
+
+            var maybeMeta = db.getMeta("example.com");
+            assertTrue(maybeMeta.isPresent());
+            assertEquals(original, maybeMeta.get());
+        }
+    }
 }