1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-08 10:02:41 +02:00

Compare commits

...

7 Commits

Author SHA1 Message Date
Viktor Lofgren
a7d91c8527 (crawler) Clean up fetcher detailed logging 2025-04-21 12:53:52 +02:00
Viktor Lofgren
7151602124 (crawler) Reduce the likelihood of crawler tasks locking on domains before they are ready
Cleaning up after changes.
2025-04-21 12:47:03 +02:00
Viktor Lofgren
884e33bd4a (crawler) Reduce the likelihood of crawler tasks locking on domains before they are ready
Change back to an unbounded queue, tighten sleep times a bit.
2025-04-21 11:48:15 +02:00
Viktor Lofgren
e84d5c497a (crawler) Reduce the likelihood of crawler tasks locking on domains before they are ready
Change to a bounded queue and adding a sleep to reduce the amount of effectively busy looping threads.
2025-04-21 00:39:26 +02:00
Viktor Lofgren
2d2d3e2466 (crawler) Reduce the likelihood of crawler tasks locking on domains before they are ready
Change to a bounded queue and adding a sleep to reduce the amount of effectively busy looping threads.
2025-04-21 00:36:48 +02:00
Viktor Lofgren
647dd9b12f (crawler) Reduce the likelihood of crawler tasks locking on domains before they are ready 2025-04-21 00:24:30 +02:00
Viktor Lofgren
de4e2849ce (crawler) Tweak request retry counts
Increase the default number of tries to 3, but don't retry on SSL errors as they are unlikely to fix themselves in the short term.
2025-04-19 00:19:48 +02:00
3 changed files with 141 additions and 76 deletions

View File

@@ -43,6 +43,7 @@ import java.nio.file.StandardCopyOption;
import java.security.Security; import java.security.Security;
import java.util.*; import java.util.*;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
@@ -66,6 +67,8 @@ public class CrawlerMain extends ProcessMainClass {
private final Map<String, CrawlTask> pendingCrawlTasks = new ConcurrentHashMap<>(); private final Map<String, CrawlTask> pendingCrawlTasks = new ConcurrentHashMap<>();
private final LinkedBlockingQueue<CrawlTask> retryQueue = new LinkedBlockingQueue<>();
private final AtomicInteger tasksDone = new AtomicInteger(0); private final AtomicInteger tasksDone = new AtomicInteger(0);
private final HttpFetcherImpl fetcher; private final HttpFetcherImpl fetcher;
@@ -277,12 +280,29 @@ public class CrawlerMain extends ProcessMainClass {
} }
// Schedule viable tasks for execution until list is empty // Schedule viable tasks for execution until list is empty
while (!taskList.isEmpty()) { for (int emptyRuns = 0;emptyRuns < 300;) {
taskList.removeIf(this::trySubmitDeferredTask); boolean hasTasks = !taskList.isEmpty();
// Add a small pause here to avoid busy looping toward the end of the execution cycle when // The order of these checks very important to avoid a race condition
// we might have no new viable tasks to run for hours on end // where we miss a task that is put into the retry queue
TimeUnit.MILLISECONDS.sleep(50); boolean hasRunningTasks = pool.getActiveCount() > 0;
boolean hasRetryTasks = !retryQueue.isEmpty();
if (hasTasks || hasRetryTasks || hasRunningTasks) {
retryQueue.drainTo(taskList);
// Try to submit any tasks that are in the retry queue (this will block if the pool is full)
taskList.removeIf(this::trySubmitDeferredTask);
// Add a small pause here to avoid busy looping toward the end of the execution cycle when
// we might have no new viable tasks to run for hours on end
TimeUnit.MILLISECONDS.sleep(5);
} else {
// We have no tasks to run, and no tasks in the retry queue
// but we wait a bit to see if any new tasks come in via the retry queue
emptyRuns++;
TimeUnit.SECONDS.sleep(1);
}
} }
logger.info("Shutting down the pool, waiting for tasks to complete..."); logger.info("Shutting down the pool, waiting for tasks to complete...");
@@ -414,7 +434,7 @@ public class CrawlerMain extends ProcessMainClass {
/** Best effort indicator whether we could start this now without getting stuck in /** Best effort indicator whether we could start this now without getting stuck in
* DomainLocks purgatory */ * DomainLocks purgatory */
public boolean canRun() { public boolean canRun() {
return domainLocks.canLock(new EdgeDomain(domain)); return domainLocks.isLockableHint(new EdgeDomain(domain));
} }
@Override @Override
@@ -425,66 +445,82 @@ public class CrawlerMain extends ProcessMainClass {
return; return;
} }
Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE); Optional<DomainLocks.DomainLock> lock = domainLocks.tryLockDomain(new EdgeDomain(domain));
Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP); // We don't have a lock, so we can't run this task
Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain); // we return to avoid blocking the pool for too long
if (lock.isEmpty()) {
// Move the WARC file to a temp file if it exists, so we can resume the crawl using the old data if (retryQueue.remainingCapacity() > 0) {
// while writing to the same file name as before // Sleep a moment to avoid busy looping via the retry queue
if (Files.exists(newWarcFile)) { // in the case when few tasks remain and almost all are ineligible for
Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING); // immediate restart
} Thread.sleep(5);
else {
Files.deleteIfExists(tempFile);
}
try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
CrawlDataReference reference = getReference()
)
{
// Resume the crawl if it was aborted
if (Files.exists(tempFile)) {
retriever.syncAbortedRun(tempFile);
Files.delete(tempFile);
} }
DomainLinks domainLinks = anchorTagsSource.getAnchorTags(domain); retryQueue.put(this);
return;
}
DomainLocks.DomainLock domainLock = lock.get();
int size; try (domainLock) {
try (var lock = domainLocks.lockDomain(new EdgeDomain(domain))) { Thread.currentThread().setName("crawling:" + domain);
size = retriever.crawlDomain(domainLinks, reference);
Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
// Move the WARC file to a temp file if it exists, so we can resume the crawl using the old data
// while writing to the same file name as before
if (Files.exists(newWarcFile)) {
Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING);
}
else {
Files.deleteIfExists(tempFile);
} }
// Delete the reference crawl data if it's not the same as the new one try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
// (mostly a case when migrating from legacy->warc) var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
reference.delete(); CrawlDataReference reference = getReference())
{
// Resume the crawl if it was aborted
if (Files.exists(tempFile)) {
retriever.syncAbortedRun(tempFile);
Files.delete(tempFile);
}
// Convert the WARC file to Parquet DomainLinks domainLinks = anchorTagsSource.getAnchorTags(domain);
SlopCrawlDataRecord
.convertWarc(domain, userAgent, newWarcFile, slopFile);
// Optionally archive the WARC file if full retention is enabled, int size = retriever.crawlDomain(domainLinks, reference);
// otherwise delete it:
warcArchiver.consumeWarc(newWarcFile, domain);
// Mark the domain as finished in the work log // Delete the reference crawl data if it's not the same as the new one
workLog.setJobToFinished(domain, slopFile.toString(), size); // (mostly a case when migrating from legacy->warc)
reference.delete();
// Update the progress bar // Convert the WARC file to Parquet
heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks); SlopCrawlDataRecord
.convertWarc(domain, userAgent, newWarcFile, slopFile);
logger.info("Fetched {}", domain); // Optionally archive the WARC file if full retention is enabled,
} catch (Exception e) { // otherwise delete it:
logger.error("Error fetching domain " + domain, e); warcArchiver.consumeWarc(newWarcFile, domain);
}
finally {
// We don't need to double-count these; it's also kept in the workLog
pendingCrawlTasks.remove(domain);
Thread.currentThread().setName("[idle]");
Files.deleteIfExists(newWarcFile); // Mark the domain as finished in the work log
Files.deleteIfExists(tempFile); workLog.setJobToFinished(domain, slopFile.toString(), size);
// Update the progress bar
heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);
logger.info("Fetched {}", domain);
} catch (Exception e) {
logger.error("Error fetching domain " + domain, e);
}
finally {
// We don't need to double-count these; it's also kept in the workLog
pendingCrawlTasks.remove(domain);
Thread.currentThread().setName("[idle]");
Files.deleteIfExists(newWarcFile);
Files.deleteIfExists(tempFile);
}
} }
} }

View File

@@ -47,11 +47,13 @@ import org.slf4j.Marker;
import org.slf4j.MarkerFactory; import org.slf4j.MarkerFactory;
import javax.net.ssl.SSLContext; import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLException;
import java.io.IOException; import java.io.IOException;
import java.net.SocketTimeoutException; import java.net.SocketTimeoutException;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.security.NoSuchAlgorithmException; import java.security.NoSuchAlgorithmException;
import java.time.Duration; import java.time.Duration;
import java.time.Instant;
import java.util.*; import java.util.*;
import java.util.concurrent.Semaphore; import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
@@ -392,25 +394,31 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
if (probeType == HttpFetcher.ProbeType.FULL) { if (probeType == HttpFetcher.ProbeType.FULL) {
try { try {
var probeResult = probeContentType(url, cookies, timer, contentTags); var probeResult = probeContentType(url, cookies, timer, contentTags);
logger.info(crawlerAuditMarker, "Probe result {} for {}", probeResult.getClass().getSimpleName(), url);
switch (probeResult) { switch (probeResult) {
case HttpFetcher.ContentTypeProbeResult.NoOp(): case HttpFetcher.ContentTypeProbeResult.NoOp():
break; // break; //
case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl): case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl):
logger.info(crawlerAuditMarker, "Probe result OK for {}", url);
url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching
break; break;
case ContentTypeProbeResult.BadContentType badContentType: case ContentTypeProbeResult.BadContentType badContentType:
warcRecorder.flagAsFailedContentTypeProbe(url, badContentType.contentType(), badContentType.statusCode()); warcRecorder.flagAsFailedContentTypeProbe(url, badContentType.contentType(), badContentType.statusCode());
logger.info(crawlerAuditMarker, "Probe result Bad ContenType ({}) for {}", badContentType.contentType(), url);
return new HttpFetchResult.ResultNone(); return new HttpFetchResult.ResultNone();
case ContentTypeProbeResult.BadContentType.Timeout(Exception ex): case ContentTypeProbeResult.BadContentType.Timeout(Exception ex):
logger.info(crawlerAuditMarker, "Probe result Timeout for {}", url);
warcRecorder.flagAsTimeout(url); warcRecorder.flagAsTimeout(url);
return new HttpFetchResult.ResultException(ex); return new HttpFetchResult.ResultException(ex);
case ContentTypeProbeResult.Exception(Exception ex): case ContentTypeProbeResult.Exception(Exception ex):
logger.info(crawlerAuditMarker, "Probe result Exception({}) for {}", ex.getClass().getSimpleName(), url);
warcRecorder.flagAsError(url, ex); warcRecorder.flagAsError(url, ex);
return new HttpFetchResult.ResultException(ex); return new HttpFetchResult.ResultException(ex);
case ContentTypeProbeResult.HttpError httpError: case ContentTypeProbeResult.HttpError httpError:
logger.info(crawlerAuditMarker, "Probe result HTTP Error ({}) for {}", httpError.statusCode(), url);
return new HttpFetchResult.ResultException(new HttpException("HTTP status code " + httpError.statusCode() + ": " + httpError.message())); return new HttpFetchResult.ResultException(new HttpException("HTTP status code " + httpError.statusCode() + ": " + httpError.message()));
case ContentTypeProbeResult.Redirect redirect: case ContentTypeProbeResult.Redirect redirect:
logger.info(crawlerAuditMarker, "Probe result redirect for {} -> {}", url, redirect.location());
return new HttpFetchResult.ResultRedirect(redirect.location()); return new HttpFetchResult.ResultRedirect(redirect.location());
} }
} catch (Exception ex) { } catch (Exception ex) {
@@ -429,27 +437,32 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
contentTags.paint(request); contentTags.paint(request);
try (var sl = new SendLock()) { try (var sl = new SendLock()) {
Instant start = Instant.now();
HttpFetchResult result = warcRecorder.fetch(client, cookies, request); HttpFetchResult result = warcRecorder.fetch(client, cookies, request);
Duration fetchDuration = Duration.between(start, Instant.now());
if (result instanceof HttpFetchResult.ResultOk ok) { if (result instanceof HttpFetchResult.ResultOk ok) {
if (ok.statusCode() == 304) { if (ok.statusCode() == 304) {
return new HttpFetchResult.Result304Raw(); result = new HttpFetchResult.Result304Raw();
} }
} }
switch (result) { switch (result) {
case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {}", ok.statusCode(), url); case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {} ({} ms)", ok.statusCode(), url, fetchDuration.toMillis());
case HttpFetchResult.ResultRedirect redirect -> logger.info(crawlerAuditMarker, "Fetch result redirect: {} for {}", redirect.url(), url); case HttpFetchResult.ResultRedirect redirect -> logger.info(crawlerAuditMarker, "Fetch result redirect: {} for {}", redirect.url(), url);
case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none for {}", url); case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none for {}", url);
case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception for " + url + ": {}", ex.ex()); case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception for {}", url, ex.ex());
case HttpFetchResult.Result304Raw raw -> logger.info(crawlerAuditMarker, "Fetch result: 304 Raw for {}", url); case HttpFetchResult.Result304Raw raw -> logger.info(crawlerAuditMarker, "Fetch result: 304 Raw for {}", url);
case HttpFetchResult.Result304ReplacedWithReference ref -> logger.info(crawlerAuditMarker, "Fetch result: 304 With reference for {}", url); case HttpFetchResult.Result304ReplacedWithReference ref -> logger.info(crawlerAuditMarker, "Fetch result: 304 With reference for {}", url);
} }
return result; return result;
} }
} }
catch (Exception ex) { catch (Exception ex) {
ex.printStackTrace(); logger.error(crawlerAuditMarker, "Fetch result exception for {}", url, ex);
return new HttpFetchResult.ResultException(ex); return new HttpFetchResult.ResultException(ex);
} }
@@ -622,18 +635,21 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
@Override @Override
public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) { public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
if (exception instanceof SocketTimeoutException ex) { if (exception instanceof SocketTimeoutException) { // Timeouts are not recoverable
return false;
}
if (exception instanceof SSLException) { // SSL exceptions are unlikely to be recoverable
return false; return false;
} }
return executionCount < 3; return executionCount <= 3;
} }
@Override @Override
public boolean retryRequest(HttpResponse response, int executionCount, HttpContext context) { public boolean retryRequest(HttpResponse response, int executionCount, HttpContext context) {
return switch (response.getCode()) { return switch (response.getCode()) {
case 500, 503 -> executionCount < 2; case 500, 503 -> executionCount <= 2;
case 429 -> executionCount < 3; case 429 -> executionCount <= 3;
default -> false; default -> false;
}; };
} }

View File

@@ -3,6 +3,7 @@ package nu.marginalia.crawl.logic;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import java.util.Map; import java.util.Map;
import java.util.Optional;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore; import java.util.concurrent.Semaphore;
@@ -19,8 +20,22 @@ public class DomainLocks {
* and may be held by another thread. The caller is responsible for locking and releasing the lock. * and may be held by another thread. The caller is responsible for locking and releasing the lock.
*/ */
public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException { public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
return new DomainLock(domain.toString(), var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits));
sem.acquire();
return new DomainLock(sem);
}
public Optional<DomainLock> tryLockDomain(EdgeDomain domain) {
var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
if (sem.tryAcquire(1)) {
return Optional.of(new DomainLock(sem));
}
else {
// We don't have a lock, so we return an empty optional
return Optional.empty();
}
} }
private Semaphore defaultPermits(String topDomain) { private Semaphore defaultPermits(String topDomain) {
@@ -44,7 +59,11 @@ public class DomainLocks {
return new Semaphore(2); return new Semaphore(2);
} }
public boolean canLock(EdgeDomain domain) { /** Returns true if the domain is lockable, i.e. if it is not already locked by another thread.
* (this is just a hint, and does not guarantee that the domain is actually lockable any time
* after this method returns true)
*/
public boolean isLockableHint(EdgeDomain domain) {
Semaphore sem = locks.get(domain.topDomain.toLowerCase()); Semaphore sem = locks.get(domain.topDomain.toLowerCase());
if (null == sem) if (null == sem)
return true; return true;
@@ -53,22 +72,16 @@ public class DomainLocks {
} }
public static class DomainLock implements AutoCloseable { public static class DomainLock implements AutoCloseable {
private final String domainName;
private final Semaphore semaphore; private final Semaphore semaphore;
DomainLock(String domainName, Semaphore semaphore) throws InterruptedException { DomainLock(Semaphore semaphore) {
this.domainName = domainName;
this.semaphore = semaphore; this.semaphore = semaphore;
Thread.currentThread().setName("crawling:" + domainName + " [await domain lock]");
semaphore.acquire();
Thread.currentThread().setName("crawling:" + domainName);
} }
@Override @Override
public void close() throws Exception { public void close() throws Exception {
semaphore.release(); semaphore.release();
Thread.currentThread().setName("crawling:" + domainName + " [wrapping up]"); Thread.currentThread().setName("[idle]");
} }
} }
} }