Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
Synced 2025-10-05 21:22:39 +02:00

Compare commits: deploy-011 (4 commits)

ecb0e57a1a
8c61f61b46
662a18c933
1c2426a052
CrawlerMain.java
@@ -103,10 +103,18 @@ public class CrawlerMain extends ProcessMainClass {
         this.blacklist = blacklist;
         this.node = processConfiguration.node();
 
+        SimpleBlockingThreadPool.ThreadType threadType;
+        if (Boolean.getBoolean("crawler.useVirtualThreads")) {
+            threadType = SimpleBlockingThreadPool.ThreadType.VIRTUAL;
+        }
+        else {
+            threadType = SimpleBlockingThreadPool.ThreadType.PLATFORM;
+        }
+
         pool = new SimpleBlockingThreadPool("CrawlerPool",
                 Integer.getInteger("crawler.poolSize", 256),
                 1,
-                SimpleBlockingThreadPool.ThreadType.VIRTUAL);
+                threadType);
 
 
         // Wait for the blacklist to be loaded before starting the crawl
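The crawler pool's thread flavor is now chosen at startup from a JVM system property, with platform threads as the default. A self-contained sketch of the same pattern, using a stand-in enum for SimpleBlockingThreadPool.ThreadType:

// Run with: java -Dcrawler.useVirtualThreads=true -Dcrawler.poolSize=512 ThreadTypeDemo
public class ThreadTypeDemo {
    enum ThreadType { VIRTUAL, PLATFORM }

    public static void main(String[] args) {
        // Boolean.getBoolean reads the named system property and yields false
        // when it is unset or not "true", so PLATFORM remains the default.
        ThreadType threadType = Boolean.getBoolean("crawler.useVirtualThreads")
                ? ThreadType.VIRTUAL
                : ThreadType.PLATFORM;
        int poolSize = Integer.getInteger("crawler.poolSize", 256);
        System.out.println(threadType + " pool, size " + poolSize);
    }
}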
DomainStateDb.java
@@ -11,6 +11,7 @@ import java.nio.file.Path;
 import java.sql.Connection;
 import java.sql.DriverManager;
 import java.sql.SQLException;
+import java.time.Duration;
 import java.time.Instant;
 import java.util.Objects;
 import java.util.Optional;
@@ -24,6 +25,17 @@ public class DomainStateDb implements AutoCloseable {
 
     private final Connection connection;
 
+
+    public record CrawlMeta(
+            String domainName,
+            Instant lastFullCrawl,
+            Duration recrawlTime,
+            Duration crawlTime,
+            int recrawlErrors,
+            int crawlChanges,
+            int totalCrawlSize
+    ) {}
+
     public record SummaryRecord(
             String domainName,
             Instant lastUpdated,
@@ -102,6 +114,17 @@ public class DomainStateDb implements AutoCloseable {
                         feedUrl TEXT
                     )
                     """);
+            stmt.executeUpdate("""
+                    CREATE TABLE IF NOT EXISTS crawl_meta (
+                        domain TEXT PRIMARY KEY,
+                        lastFullCrawlEpochMs LONG NOT NULL,
+                        recrawlTimeMs LONG NOT NULL,
+                        recrawlErrors INTEGER NOT NULL,
+                        crawlTimeMs LONG NOT NULL,
+                        crawlChanges INTEGER NOT NULL,
+                        totalCrawlSize INTEGER NOT NULL
+                    )
+                    """);
             stmt.executeUpdate("""
                     CREATE TABLE IF NOT EXISTS favicon (
                         domain TEXT PRIMARY KEY,
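The new crawl_meta table stores timestamps and durations as millisecond integers. SQLite is loosely typed, so the non-standard LONG declaration still holds 64-bit integers without truncation. A minimal sketch for creating the same schema outside the crawler, assuming the xerial sqlite-jdbc driver and a throwaway database file:

import java.sql.DriverManager;
import java.sql.SQLException;

public class CrawlMetaSchemaDemo {
    public static void main(String[] args) throws SQLException {
        try (var conn = DriverManager.getConnection("jdbc:sqlite:/tmp/domainstate-demo.db");
             var stmt = conn.createStatement()) {
            // Same DDL as the diff; the *Ms columns hold epoch/duration milliseconds.
            stmt.executeUpdate("""
                    CREATE TABLE IF NOT EXISTS crawl_meta (
                        domain TEXT PRIMARY KEY,
                        lastFullCrawlEpochMs LONG NOT NULL,
                        recrawlTimeMs LONG NOT NULL,
                        recrawlErrors INTEGER NOT NULL,
                        crawlTimeMs LONG NOT NULL,
                        crawlChanges INTEGER NOT NULL,
                        totalCrawlSize INTEGER NOT NULL
                    )
                    """);
        }
    }
}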
@@ -164,6 +187,26 @@ public class DomainStateDb implements AutoCloseable {
         return Optional.empty();
     }
 
+    public void save(CrawlMeta crawlMeta) {
+        if (connection == null) throw new IllegalStateException("No connection to domainstate db");
+
+        try (var stmt = connection.prepareStatement("""
+                INSERT OR REPLACE INTO crawl_meta (domain, lastFullCrawlEpochMs, recrawlTimeMs, recrawlErrors, crawlTimeMs, crawlChanges, totalCrawlSize)
+                VALUES (?, ?, ?, ?, ?, ?, ?)
+                """)) {
+            stmt.setString(1, crawlMeta.domainName());
+            stmt.setLong(2, crawlMeta.lastFullCrawl().toEpochMilli());
+            stmt.setLong(3, crawlMeta.recrawlTime().toMillis());
+            stmt.setInt(4, crawlMeta.recrawlErrors());
+            stmt.setLong(5, crawlMeta.crawlTime().toMillis());
+            stmt.setInt(6, crawlMeta.crawlChanges());
+            stmt.setInt(7, crawlMeta.totalCrawlSize());
+            stmt.executeUpdate();
+        } catch (SQLException e) {
+            logger.error("Failed to insert crawl meta record", e);
+        }
+    }
+
     public void save(SummaryRecord record) {
         if (connection == null) throw new IllegalStateException("No connection to domainstate db");
 
@@ -182,7 +225,35 @@ public class DomainStateDb implements AutoCloseable {
         }
     }
 
-    public Optional<SummaryRecord> get(String domainName) {
+    public Optional<CrawlMeta> getMeta(String domainName) {
+        if (connection == null)
+            return Optional.empty();
+
+        try (var stmt = connection.prepareStatement("""
+                SELECT domain, lastFullCrawlEpochMs, recrawlTimeMs, recrawlErrors, crawlTimeMs, crawlChanges, totalCrawlSize
+                FROM crawl_meta
+                WHERE domain = ?
+                """)) {
+            stmt.setString(1, domainName);
+            var rs = stmt.executeQuery();
+            if (rs.next()) {
+                return Optional.of(new CrawlMeta(
+                        rs.getString("domain"),
+                        Instant.ofEpochMilli(rs.getLong("lastFullCrawlEpochMs")),
+                        Duration.ofMillis(rs.getLong("recrawlTimeMs")),
+                        Duration.ofMillis(rs.getLong("crawlTimeMs")),
+                        rs.getInt("recrawlErrors"),
+                        rs.getInt("crawlChanges"),
+                        rs.getInt("totalCrawlSize")
+                ));
+            }
+        } catch (SQLException ex) {
+            logger.error("Failed to get crawl meta record", ex);
+        }
+        return Optional.empty();
+    }
+
+    public Optional<SummaryRecord> getSummary(String domainName) {
         if (connection == null)
             return Optional.empty();
 
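Together, save(CrawlMeta) and getMeta give a keyed write/read cycle: since domain is the primary key, INSERT OR REPLACE overwrites any existing row for the same domain. A hypothetical round trip, assuming the Path-taking constructor the tests below use:

import java.nio.file.Path;
import java.time.Duration;
import java.time.Instant;

public class CrawlMetaRoundTrip {
    public static void main(String[] args) throws Exception {
        try (var db = new DomainStateDb(Path.of("/tmp/domainstate-demo.db"))) {
            db.save(new DomainStateDb.CrawlMeta(
                    "www.example.com",
                    Instant.now(),          // lastFullCrawl
                    Duration.ofMinutes(2),  // recrawlTime
                    Duration.ofMinutes(5),  // crawlTime
                    0,                      // recrawlErrors
                    14,                     // crawlChanges
                    320));                  // totalCrawlSize
            // A second save for the same domain would replace this row.
            db.getMeta("www.example.com")
                    .ifPresent(meta -> System.out.println("changes: " + meta.crawlChanges()));
        }
    }
}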
HttpFetcherImpl.java
@@ -29,6 +29,7 @@ import java.net.http.HttpResponse;
 import java.net.http.HttpTimeoutException;
 import java.time.Duration;
 import java.util.*;
+import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Semaphore;
 import java.util.zip.GZIPInputStream;
@@ -56,12 +57,21 @@ public class HttpFetcherImpl implements HttpFetcher {
     private final HttpClient client;
 
     private HttpClient createClient() {
+        final ExecutorService executorService;
+
+        if (Boolean.getBoolean("crawler.httpclient.useVirtualThreads")) {
+            executorService = Executors.newVirtualThreadPerTaskExecutor();
+        }
+        else {
+            executorService = Executors.newCachedThreadPool();
+        }
+
         return HttpClient.newBuilder()
                 .sslContext(NoSecuritySSL.buildSslContext())
                 .cookieHandler(cookies)
                 .followRedirects(HttpClient.Redirect.NORMAL)
                 .connectTimeout(Duration.ofSeconds(8))
-                .executor(Executors.newVirtualThreadPerTaskExecutor())
+                .executor(executorService)
                 .build();
     }
 
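The same property-driven switch is applied to the JDK HttpClient's executor, so the formerly unconditional virtual-thread executor is now opt-in here as well. A standalone sketch without the Marginalia-specific SSL and cookie wiring:

import java.net.http.HttpClient;
import java.time.Duration;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class FetcherClientDemo {
    static HttpClient createClient() {
        // Same selection logic as the diff; virtual threads require JDK 21+.
        ExecutorService executorService = Boolean.getBoolean("crawler.httpclient.useVirtualThreads")
                ? Executors.newVirtualThreadPerTaskExecutor()
                : Executors.newCachedThreadPool();
        return HttpClient.newBuilder()
                .followRedirects(HttpClient.Redirect.NORMAL)
                .connectTimeout(Duration.ofSeconds(8))
                .executor(executorService)
                .build();
    }

    public static void main(String[] args) {
        System.out.println(createClient());
    }
}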
CrawlerRetreiver.java
@@ -26,6 +26,8 @@ import java.io.IOException;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.nio.file.Path;
+import java.time.Duration;
+import java.time.Instant;
 import java.util.List;
 import java.util.Optional;
 import java.util.concurrent.TimeUnit;
@@ -113,15 +115,19 @@ public class CrawlerRetreiver implements AutoCloseable {
                     throw new InterruptedException();
                 }
 
+                Instant recrawlStart = Instant.now();
+                CrawlerRevisitor.RecrawlMetadata recrawlMetadata = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
+                Duration recrawlTime = Duration.between(recrawlStart, Instant.now());
+
                 // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
-                if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
+                if (recrawlMetadata.size() > 0) {
                     // If we have reference data, we will always grow the crawl depth a bit
                     crawlFrontier.increaseDepth(1.5, 2500);
                 }
 
                 oldCrawlData.close(); // proactively close the crawl data reference here to not hold onto expensive resources
 
-                yield crawlDomain(probedUrl, robotsRules, delayTimer, domainLinks);
+                yield crawlDomain(probedUrl, robotsRules, delayTimer, domainLinks, recrawlMetadata, recrawlTime);
             }
             case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> {
                 domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString()));
@@ -143,7 +149,11 @@ public class CrawlerRetreiver implements AutoCloseable {
     private int crawlDomain(EdgeUrl rootUrl,
                             SimpleRobotRules robotsRules,
                             CrawlDelayTimer delayTimer,
-                            DomainLinks domainLinks) {
+                            DomainLinks domainLinks,
+                            CrawlerRevisitor.RecrawlMetadata recrawlMetadata,
+                            Duration recrawlTime) {
+
+        Instant crawlStart = Instant.now();
 
         // Add external links to the crawl frontier
         crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));
@@ -153,6 +163,8 @@ public class CrawlerRetreiver implements AutoCloseable {
             crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
         }
 
+        int crawlerAdditions = 0;
+
         while (!crawlFrontier.isEmpty()
                 && !crawlFrontier.isCrawlDepthReached()
                 && errorCount < MAX_ERRORS
@@ -184,7 +196,11 @@ public class CrawlerRetreiver implements AutoCloseable {
                 continue;
 
             try {
-                fetchContentWithReference(top, delayTimer, DocumentWithReference.empty());
+                var result = fetchContentWithReference(top, delayTimer, DocumentWithReference.empty());
+
+                if (result.isOk()) {
+                    crawlerAdditions++;
+                }
             }
             catch (InterruptedException ex) {
                 Thread.currentThread().interrupt();
@@ -192,6 +208,17 @@ public class CrawlerRetreiver implements AutoCloseable {
             }
         }
 
+        Duration crawlTime = Duration.between(crawlStart, Instant.now());
+        domainStateDb.save(new DomainStateDb.CrawlMeta(
+                domain,
+                Instant.now(),
+                recrawlTime,
+                crawlTime,
+                recrawlMetadata.errors(),
+                crawlerAdditions,
+                recrawlMetadata.size() + crawlerAdditions
+        ));
+
         return crawlFrontier.visitedSize();
     }
 
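At the end of crawlDomain the run's bookkeeping is persisted: recrawlErrors comes from the revisit pass, crawlChanges counts freshly fetched documents, and totalCrawlSize is the replayed size plus those additions. An illustration of how the persisted fields relate (field meanings inferred from the diff; values made up):

public class CrawlMetaAccounting {
    public static void main(String[] args) {
        int recrawlSize = 120;     // documents replayed from the previous crawl data
        int recrawlErrors = 3;     // errors counted during the revisit pass
        int crawlerAdditions = 45; // fetches this run where result.isOk()
        int totalCrawlSize = recrawlSize + crawlerAdditions;
        System.out.printf("changes=%d errors=%d total=%d%n",
                crawlerAdditions, recrawlErrors, totalCrawlSize);
    }
}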
@@ -324,7 +351,7 @@ public class CrawlerRetreiver implements AutoCloseable {
             );
 
     private Optional<String> guessFeedUrl(CrawlDelayTimer timer) throws InterruptedException {
-        var oldDomainStateRecord = domainStateDb.get(domain);
+        var oldDomainStateRecord = domainStateDb.getSummary(domain);
 
         // If we are already aware of an old feed URL, then we can just revalidate it
         if (oldDomainStateRecord.isPresent()) {
CrawlerRevisitor.java
@@ -31,7 +31,7 @@ public class CrawlerRevisitor {
     }
 
     /** Performs a re-crawl of old documents, comparing etags and last-modified */
-    public int recrawl(CrawlDataReference oldCrawlData,
+    public RecrawlMetadata recrawl(CrawlDataReference oldCrawlData,
                        SimpleRobotRules robotsRules,
                        CrawlDelayTimer delayTimer)
             throws InterruptedException {
@@ -39,6 +39,7 @@ public class CrawlerRevisitor {
         int retained = 0;
         int errors = 0;
         int skipped = 0;
+        int size = 0;
 
         for (CrawledDocument doc : oldCrawlData) {
             if (errors > 20) {
@@ -82,6 +83,7 @@ public class CrawlerRevisitor {
                 continue;
             }
 
+            size++;
 
             double skipProb;
 
@@ -154,6 +156,8 @@ public class CrawlerRevisitor {
             }
         }
 
-        return recrawled;
+        return new RecrawlMetadata(size, errors, skipped);
     }
+
+    public record RecrawlMetadata(int size, int errors, int skipped) {}
 }
DomainStateDbTest.java
@@ -8,6 +8,7 @@ import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
+import java.time.Duration;
 import java.time.Instant;
 
 import static org.junit.jupiter.api.Assertions.*;
@@ -47,8 +48,8 @@ class DomainStateDbTest {
             db.save(allFields);
             db.save(minFields);
 
-            assertEquals(allFields, db.get("all.marginalia.nu").orElseThrow());
-            assertEquals(minFields, db.get("min.marginalia.nu").orElseThrow());
+            assertEquals(allFields, db.getSummary("all.marginalia.nu").orElseThrow());
+            assertEquals(minFields, db.getSummary("min.marginalia.nu").orElseThrow());
 
             var updatedAllFields = new DomainStateDb.SummaryRecord(
                     "all.marginalia.nu",
@@ -59,7 +60,19 @@ class DomainStateDbTest {
             );
 
             db.save(updatedAllFields);
-            assertEquals(updatedAllFields, db.get("all.marginalia.nu").orElseThrow());
+            assertEquals(updatedAllFields, db.getSummary("all.marginalia.nu").orElseThrow());
+        }
+    }
+
+    @Test
+    public void testMetadata() throws SQLException {
+        try (var db = new DomainStateDb(tempFile)) {
+            var original = new DomainStateDb.CrawlMeta("example.com", Instant.ofEpochMilli(12345), Duration.ofMillis(30), Duration.ofMillis(300), 1, 2, 3);
+            db.save(original);
+
+            var maybeMeta = db.getMeta("example.com");
+            assertTrue(maybeMeta.isPresent());
+            assertEquals(original, maybeMeta.get());
         }
     }
 