mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00

Compare commits: deploy-012 ... deploy-014 (20 commits)

a7d91c8527
7151602124
884e33bd4a
e84d5c497a
2d2d3e2466
647dd9b12f
de4e2849ce
3c43f1954e
fa2462ec39
f4ad7145db
068b450180
05b909a21f
3d179cddce
1a2aae496a
353cdffb3f
2e3f1313c7
58e6f141ce
500f63e921
6dfbedda1e
9715ddb105
@@ -20,7 +20,6 @@ import nu.marginalia.model.crawldata.CrawledDocument;
 import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.model.crawldata.SerializableCrawlData;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
 import org.junit.jupiter.api.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -247,7 +246,7 @@ public class CrawlingThenConvertingIntegrationTest {
     private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws Exception {
         List<SerializableCrawlData> data = new ArrayList<>();

-        try (var recorder = new WarcRecorder(fileName, new BasicCookieStore());
+        try (var recorder = new WarcRecorder(fileName);
             var db = new DomainStateDb(dbTempFile))
        {
            new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, db, recorder).crawlDomain();
@@ -43,6 +43,7 @@ import java.nio.file.StandardCopyOption;
 import java.security.Security;
 import java.util.*;
 import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;

@@ -66,6 +67,8 @@ public class CrawlerMain extends ProcessMainClass {

     private final Map<String, CrawlTask> pendingCrawlTasks = new ConcurrentHashMap<>();

+    private final LinkedBlockingQueue<CrawlTask> retryQueue = new LinkedBlockingQueue<>();
+
     private final AtomicInteger tasksDone = new AtomicInteger(0);
     private final HttpFetcherImpl fetcher;
@@ -277,12 +280,29 @@ public class CrawlerMain extends ProcessMainClass {
         }

         // Schedule viable tasks for execution until list is empty
-        while (!taskList.isEmpty()) {
-            taskList.removeIf(this::trySubmitDeferredTask);
-
-            // Add a small pause here to avoid busy looping toward the end of the execution cycle when
-            // we might have no new viable tasks to run for hours on end
-            TimeUnit.MILLISECONDS.sleep(50);
-        }
+        for (int emptyRuns = 0; emptyRuns < 300;) {
+            boolean hasTasks = !taskList.isEmpty();
+
+            // The order of these checks is very important to avoid a race condition
+            // where we miss a task that is put into the retry queue
+            boolean hasRunningTasks = pool.getActiveCount() > 0;
+            boolean hasRetryTasks = !retryQueue.isEmpty();
+
+            if (hasTasks || hasRetryTasks || hasRunningTasks) {
+                retryQueue.drainTo(taskList);
+
+                // Try to submit any tasks that are in the retry queue (this will block if the pool is full)
+                taskList.removeIf(this::trySubmitDeferredTask);
+
+                // Add a small pause here to avoid busy looping toward the end of the execution cycle when
+                // we might have no new viable tasks to run for hours on end
+                TimeUnit.MILLISECONDS.sleep(5);
+            } else {
+                // We have no tasks to run, and no tasks in the retry queue,
+                // but we wait a bit to see if any new tasks come in via the retry queue
+                emptyRuns++;
+                TimeUnit.SECONDS.sleep(1);
+            }
+        }

         logger.info("Shutting down the pool, waiting for tasks to complete...");
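For illustration, the drain-then-submit pattern the new loop relies on can be exercised in isolation. The sketch below is hypothetical (CrawlTask and trySubmit are invented stand-ins, not the project's API); it shows why the queue is drained before the submit pass, so a task requeued concurrently is picked up on the next iteration rather than lost.

```java
import java.util.List;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

// Minimal stand-in for the drain-then-submit scheduling loop above.
class RetryLoopSketch {
    record CrawlTask(String domain) {}

    private final LinkedBlockingQueue<CrawlTask> retryQueue = new LinkedBlockingQueue<>();

    boolean trySubmit(CrawlTask task) {
        // Pretend submission always succeeds; a real pool might reject and requeue.
        System.out.println("submitted " + task.domain());
        return true;
    }

    void run(List<CrawlTask> taskList) throws InterruptedException {
        for (int emptyRuns = 0; emptyRuns < 300;) {
            boolean hasTasks = !taskList.isEmpty();
            boolean hasRetryTasks = !retryQueue.isEmpty();

            if (hasTasks || hasRetryTasks) {
                // Drain first: anything requeued while we were submitting is
                // folded back into the work list before the next submit pass.
                retryQueue.drainTo(taskList);
                taskList.removeIf(this::trySubmit);
                TimeUnit.MILLISECONDS.sleep(5);
            } else {
                emptyRuns++; // give in-flight tasks a chance to requeue themselves
                TimeUnit.SECONDS.sleep(1);
            }
        }
    }
}
```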
@@ -414,7 +434,7 @@ public class CrawlerMain extends ProcessMainClass {
         /** Best effort indicator whether we could start this now without getting stuck in
          * DomainLocks purgatory */
         public boolean canRun() {
-            return domainLocks.canLock(new EdgeDomain(domain));
+            return domainLocks.isLockableHint(new EdgeDomain(domain));
         }

         @Override
@@ -425,66 +445,82 @@ public class CrawlerMain extends ProcessMainClass {
                 return;
             }

-            Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
-            Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
-            Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
-
-            // Move the WARC file to a temp file if it exists, so we can resume the crawl using the old data
-            // while writing to the same file name as before
-            if (Files.exists(newWarcFile)) {
-                Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING);
-            }
-            else {
-                Files.deleteIfExists(tempFile);
-            }
-
-            try (var warcRecorder = new WarcRecorder(newWarcFile, fetcher); // write to a temp file for now
-                 var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
-                 CrawlDataReference reference = getReference()
-            )
-            {
-                // Resume the crawl if it was aborted
-                if (Files.exists(tempFile)) {
-                    retriever.syncAbortedRun(tempFile);
-                    Files.delete(tempFile);
+            Optional<DomainLocks.DomainLock> lock = domainLocks.tryLockDomain(new EdgeDomain(domain));
+            // We don't have a lock, so we can't run this task
+            // we return to avoid blocking the pool for too long
+            if (lock.isEmpty()) {
+                if (retryQueue.remainingCapacity() > 0) {
+                    // Sleep a moment to avoid busy looping via the retry queue
+                    // in the case when few tasks remain and almost all are ineligible for
+                    // immediate restart
+                    Thread.sleep(5);
                 }

-                DomainLinks domainLinks = anchorTagsSource.getAnchorTags(domain);
-
-                int size;
-                try (var lock = domainLocks.lockDomain(new EdgeDomain(domain))) {
-                    size = retriever.crawlDomain(domainLinks, reference);
-                }
-
-                // Delete the reference crawl data if it's not the same as the new one
-                // (mostly a case when migrating from legacy->warc)
-                reference.delete();
-
-                // Convert the WARC file to Parquet
-                SlopCrawlDataRecord
-                        .convertWarc(domain, userAgent, newWarcFile, slopFile);
-
-                // Optionally archive the WARC file if full retention is enabled,
-                // otherwise delete it:
-                warcArchiver.consumeWarc(newWarcFile, domain);
-
-                // Mark the domain as finished in the work log
-                workLog.setJobToFinished(domain, slopFile.toString(), size);
-
-                // Update the progress bar
-                heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);
-
-                logger.info("Fetched {}", domain);
-            } catch (Exception e) {
-                logger.error("Error fetching domain " + domain, e);
-            }
-            finally {
-                // We don't need to double-count these; it's also kept in the workLog
-                pendingCrawlTasks.remove(domain);
-                Thread.currentThread().setName("[idle]");
-
-                Files.deleteIfExists(newWarcFile);
-                Files.deleteIfExists(tempFile);
+                retryQueue.put(this);
+                return;
+            }
+            DomainLocks.DomainLock domainLock = lock.get();
+
+            try (domainLock) {
+                Thread.currentThread().setName("crawling:" + domain);
+
+                Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
+                Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
+                Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
+
+                // Move the WARC file to a temp file if it exists, so we can resume the crawl using the old data
+                // while writing to the same file name as before
+                if (Files.exists(newWarcFile)) {
+                    Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING);
+                }
+                else {
+                    Files.deleteIfExists(tempFile);
+                }
+
+                try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
+                     var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
+                     CrawlDataReference reference = getReference())
+                {
+                    // Resume the crawl if it was aborted
+                    if (Files.exists(tempFile)) {
+                        retriever.syncAbortedRun(tempFile);
+                        Files.delete(tempFile);
+                    }
+
+                    DomainLinks domainLinks = anchorTagsSource.getAnchorTags(domain);
+
+                    int size = retriever.crawlDomain(domainLinks, reference);
+
+                    // Delete the reference crawl data if it's not the same as the new one
+                    // (mostly a case when migrating from legacy->warc)
+                    reference.delete();
+
+                    // Convert the WARC file to Parquet
+                    SlopCrawlDataRecord
+                            .convertWarc(domain, userAgent, newWarcFile, slopFile);
+
+                    // Optionally archive the WARC file if full retention is enabled,
+                    // otherwise delete it:
+                    warcArchiver.consumeWarc(newWarcFile, domain);
+
+                    // Mark the domain as finished in the work log
+                    workLog.setJobToFinished(domain, slopFile.toString(), size);
+
+                    // Update the progress bar
+                    heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);
+
+                    logger.info("Fetched {}", domain);
+                } catch (Exception e) {
+                    logger.error("Error fetching domain " + domain, e);
+                }
+                finally {
+                    // We don't need to double-count these; it's also kept in the workLog
+                    pendingCrawlTasks.remove(domain);
+                    Thread.currentThread().setName("[idle]");
+
+                    Files.deleteIfExists(newWarcFile);
+                    Files.deleteIfExists(tempFile);
+                }
             }
         }
@@ -1,6 +1,6 @@
 package nu.marginalia.crawl.fetcher;

-import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
+import org.apache.hc.client5.http.classic.methods.HttpGet;

 /** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
 public record ContentTags(String etag, String lastMod) {

@@ -17,14 +17,14 @@ public record ContentTags(String etag, String lastMod) {
     }

     /** Paints the tags onto the request builder. */
-    public void paint(ClassicRequestBuilder getBuilder) {
+    public void paint(HttpGet request) {

         if (etag != null) {
-            getBuilder.addHeader("If-None-Match", etag);
+            request.addHeader("If-None-Match", etag);
         }

         if (lastMod != null) {
-            getBuilder.addHeader("If-Modified-Since", lastMod);
+            request.addHeader("If-Modified-Since", lastMod);
         }
     }
 }
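For illustration, here is a hedged usage sketch of the HttpGet-based paint() above, assuming Apache HttpClient 5 and the ContentTags record from this diff are on the classpath; the ETag and date values are made up.

```java
import org.apache.hc.client5.http.classic.methods.HttpGet;

public class ContentTagsExample {
    public static void main(String[] args) {
        // Hypothetical validators remembered from a previous crawl of this URL
        ContentTags tags = new ContentTags("\"abc123\"", "Wed, 21 Oct 2015 07:28:00 GMT");

        HttpGet request = new HttpGet("https://example.com/page.html");
        tags.paint(request);

        // The request now carries If-None-Match and If-Modified-Since,
        // letting the server answer 304 Not Modified instead of a full body.
        for (var header : request.getHeaders()) {
            System.out.println(header.getName() + ": " + header.getValue());
        }
    }
}
```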
@@ -1,34 +0,0 @@
-package nu.marginalia.crawl.fetcher;
-
-import java.io.IOException;
-import java.net.CookieHandler;
-import java.net.URI;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
-
-public class Cookies extends CookieHandler {
-    final ThreadLocal<ConcurrentHashMap<String, List<String>>> cookieJar = ThreadLocal.withInitial(ConcurrentHashMap::new);
-
-    public void clear() {
-        cookieJar.get().clear();
-    }
-
-    public boolean hasCookies() {
-        return !cookieJar.get().isEmpty();
-    }
-
-    public List<String> getCookies() {
-        return cookieJar.get().values().stream().flatMap(List::stream).toList();
-    }
-
-    @Override
-    public Map<String, List<String>> get(URI uri, Map<String, List<String>> requestHeaders) throws IOException {
-        return cookieJar.get();
-    }
-
-    @Override
-    public void put(URI uri, Map<String, List<String>> responseHeaders) throws IOException {
-        cookieJar.get().putAll(responseHeaders);
-    }
-}
@@ -0,0 +1,56 @@
+package nu.marginalia.crawl.fetcher;
+
+import org.apache.hc.client5.http.classic.methods.HttpUriRequestBase;
+import org.apache.hc.core5.http.ClassicHttpRequest;
+import org.apache.hc.core5.http.HttpResponse;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.StringJoiner;
+
+public class DomainCookies {
+    private final Map<String, String> cookies = new HashMap<>();
+
+    public boolean hasCookies() {
+        return !cookies.isEmpty();
+    }
+
+    public void updateCookieStore(HttpResponse response) {
+        for (var header : response.getHeaders()) {
+            if (header.getName().equalsIgnoreCase("Set-Cookie")) {
+                parseCookieHeader(header.getValue());
+            }
+        }
+    }
+
+    private void parseCookieHeader(String value) {
+        // Parse the Set-Cookie header value and extract the cookies
+
+        String[] parts = value.split(";");
+        String cookie = parts[0].trim();
+
+        if (cookie.contains("=")) {
+            String[] cookieParts = cookie.split("=");
+            String name = cookieParts[0].trim();
+            String val = cookieParts[1].trim();
+            cookies.put(name, val);
+        }
+    }
+
+    public void paintRequest(HttpUriRequestBase request) {
+        request.addHeader("Cookie", createCookieHeader());
+    }
+
+    public void paintRequest(ClassicHttpRequest request) {
+        request.addHeader("Cookie", createCookieHeader());
+    }
+
+    private String createCookieHeader() {
+        StringJoiner sj = new StringJoiner("; ");
+        for (var cookie : cookies.entrySet()) {
+            sj.add(cookie.getKey() + "=" + cookie.getValue());
+        }
+        return sj.toString();
+    }
+
+}
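For illustration, a hypothetical round trip for the DomainCookies class above: a Set-Cookie header is harvested from one response and replayed on the next request. The cookie value is invented; only Apache HttpClient 5 is assumed.

```java
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.core5.http.message.BasicClassicHttpResponse;

public class DomainCookiesExample {
    public static void main(String[] args) {
        DomainCookies cookies = new DomainCookies();

        // Simulate a server response that sets a cookie (value is made up)
        var response = new BasicClassicHttpResponse(200);
        response.addHeader("Set-Cookie", "session=abc123; Path=/; HttpOnly");
        cookies.updateCookieStore(response);

        // Subsequent requests to the same domain carry the cookie back
        HttpGet next = new HttpGet("https://example.com/next-page");
        if (cookies.hasCookies()) {
            cookies.paintRequest(next);
        }

        System.out.println(next.getFirstHeader("Cookie")); // Cookie: session=abc123
    }
}
```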
@@ -23,6 +23,7 @@ public interface HttpFetcher extends AutoCloseable {

     HttpFetchResult fetchContent(EdgeUrl url,
                                  WarcRecorder recorder,
+                                 DomainCookies cookies,
                                  CrawlDelayTimer timer,
                                  ContentTags tags,
                                  ProbeType probeType);
@@ -17,6 +17,7 @@ import nu.marginalia.model.crawldata.CrawlerDomainStatus;
 import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
 import org.apache.hc.client5.http.HttpRequestRetryStrategy;
 import org.apache.hc.client5.http.classic.HttpClient;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.config.ConnectionConfig;
 import org.apache.hc.client5.http.config.RequestConfig;
 import org.apache.hc.client5.http.cookie.BasicCookieStore;

@@ -34,6 +35,7 @@ import org.apache.hc.core5.http.io.entity.EntityUtils;
 import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.apache.hc.core5.http.message.MessageSupport;
 import org.apache.hc.core5.http.protocol.HttpContext;
+import org.apache.hc.core5.pool.PoolStats;
 import org.apache.hc.core5.util.TimeValue;
 import org.apache.hc.core5.util.Timeout;
 import org.jsoup.Jsoup;

@@ -45,11 +47,13 @@ import org.slf4j.Marker;
 import org.slf4j.MarkerFactory;

 import javax.net.ssl.SSLContext;
 import javax.net.ssl.SSLException;
 import java.io.IOException;
 import java.net.SocketTimeoutException;
 import java.net.URISyntaxException;
 import java.security.NoSuchAlgorithmException;
+import java.time.Duration;
+import java.time.Instant;
 import java.util.*;
 import java.util.concurrent.Semaphore;
 import java.util.concurrent.TimeUnit;
@@ -76,29 +80,48 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
     }

     private final CloseableHttpClient client;
+    private PoolingHttpClientConnectionManager connectionManager;
+
+    public PoolStats getPoolStats() {
+        return connectionManager.getTotalStats();
+    }

     private CloseableHttpClient createClient() throws NoSuchAlgorithmException {
         final ConnectionConfig connectionConfig = ConnectionConfig.custom()
                 .setSocketTimeout(10, TimeUnit.SECONDS)
-                .setConnectTimeout(10, TimeUnit.SECONDS)
+                .setConnectTimeout(30, TimeUnit.SECONDS)
+                .setValidateAfterInactivity(TimeValue.ofSeconds(5))
                 .build();

-        final PoolingHttpClientConnectionManager connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
-                .setMaxConnPerRoute(4)
+        connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
+                .setMaxConnPerRoute(2)
                 .setMaxConnTotal(5000)
                 .setDefaultConnectionConfig(connectionConfig)
                 .setTlsSocketStrategy(new DefaultClientTlsStrategy(SSLContext.getDefault()))
                 .build();

         connectionManager.setDefaultSocketConfig(SocketConfig.custom()
-                .setSoLinger(TimeValue.ofSeconds(15))
+                .setSoLinger(TimeValue.ofSeconds(-1))
                 .setSoTimeout(Timeout.ofSeconds(10))
                 .build()
         );

+        Thread.ofPlatform().daemon(true).start(() -> {
+            try {
+                for (;;) {
+                    TimeUnit.SECONDS.sleep(15);
+                    logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
+                }
+            }
+            catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+            }
+        });
+
         final RequestConfig defaultRequestConfig = RequestConfig.custom()
                 .setCookieSpec(StandardCookieSpec.RELAXED)
                 .setResponseTimeout(10, TimeUnit.SECONDS)
-                .setConnectionRequestTimeout(8, TimeUnit.SECONDS)
+                .setConnectionRequestTimeout(5, TimeUnit.MINUTES)
                 .build();

         return HttpClients.custom()
@@ -286,6 +309,7 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
      * recorded in the WARC file on failure.
      */
     public ContentTypeProbeResult probeContentType(EdgeUrl url,
+                                                   DomainCookies cookies,
                                                    CrawlDelayTimer timer,
                                                    ContentTags tags) {
         if (!tags.isEmpty() || !contentTypeLogic.isUrlLikeBinary(url)) {

@@ -298,9 +322,11 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
                     .addHeader("Accept-Encoding", "gzip")
                     .build();

-            var result = SendLock.wrapSend(client, head, (rsp) -> {
-                EntityUtils.consume(rsp.getEntity());
+            cookies.paintRequest(head);
+
+            return SendLock.wrapSend(client, head, (rsp) -> {
+                cookies.updateCookieStore(rsp);
+                EntityUtils.consume(rsp.getEntity());
                 int statusCode = rsp.getCode();

                 // Handle redirects

@@ -338,8 +364,6 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
                     return new ContentTypeProbeResult.BadContentType(contentType, statusCode);
                 }
             });
-
-            return result;
         }
         catch (SocketTimeoutException ex) {
@@ -361,6 +385,7 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
     @Override
     public HttpFetchResult fetchContent(EdgeUrl url,
                                         WarcRecorder warcRecorder,
+                                        DomainCookies cookies,
                                         CrawlDelayTimer timer,
                                         ContentTags contentTags,
                                         ProbeType probeType)

@@ -368,26 +393,32 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
         try {
             if (probeType == HttpFetcher.ProbeType.FULL) {
                 try {
-                    var probeResult = probeContentType(url, timer, contentTags);
-                    logger.info(crawlerAuditMarker, "Probe result {} for {}", probeResult.getClass().getSimpleName(), url);
+                    var probeResult = probeContentType(url, cookies, timer, contentTags);

                     switch (probeResult) {
+                        case HttpFetcher.ContentTypeProbeResult.NoOp():
+                            break; //
                         case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl):
+                            logger.info(crawlerAuditMarker, "Probe result OK for {}", url);
                             url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching
                             break;
                         case ContentTypeProbeResult.BadContentType badContentType:
                             warcRecorder.flagAsFailedContentTypeProbe(url, badContentType.contentType(), badContentType.statusCode());
+                            logger.info(crawlerAuditMarker, "Probe result Bad ContentType ({}) for {}", badContentType.contentType(), url);
                             return new HttpFetchResult.ResultNone();
                         case ContentTypeProbeResult.BadContentType.Timeout(Exception ex):
+                            logger.info(crawlerAuditMarker, "Probe result Timeout for {}", url);
                             warcRecorder.flagAsTimeout(url);
                             return new HttpFetchResult.ResultException(ex);
                         case ContentTypeProbeResult.Exception(Exception ex):
+                            logger.info(crawlerAuditMarker, "Probe result Exception({}) for {}", ex.getClass().getSimpleName(), url);
                             warcRecorder.flagAsError(url, ex);
                             return new HttpFetchResult.ResultException(ex);
                         case ContentTypeProbeResult.HttpError httpError:
+                            logger.info(crawlerAuditMarker, "Probe result HTTP Error ({}) for {}", httpError.statusCode(), url);
                             return new HttpFetchResult.ResultException(new HttpException("HTTP status code " + httpError.statusCode() + ": " + httpError.message()));
                         case ContentTypeProbeResult.Redirect redirect:
+                            logger.info(crawlerAuditMarker, "Probe result redirect for {} -> {}", url, redirect.location());
                             return new HttpFetchResult.ResultRedirect(redirect.location());
                     }
                 } catch (Exception ex) {
@@ -397,36 +428,41 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {

         }

-        ClassicRequestBuilder getBuilder = ClassicRequestBuilder.get(url.asURI())
-                .addHeader("User-Agent", userAgentString)
-                .addHeader("Accept-Encoding", "gzip")
-                .addHeader("Accept-Language", "en,*;q=0.5")
-                .addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8");
+        HttpGet request = new HttpGet(url.asURI());
+        request.addHeader("User-Agent", userAgentString);
+        request.addHeader("Accept-Encoding", "gzip");
+        request.addHeader("Accept-Language", "en,*;q=0.5");
+        request.addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8");

-        contentTags.paint(getBuilder);
+        contentTags.paint(request);

         try (var sl = new SendLock()) {
-            HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());
+            Instant start = Instant.now();
+            HttpFetchResult result = warcRecorder.fetch(client, cookies, request);
+
+            Duration fetchDuration = Duration.between(start, Instant.now());

             if (result instanceof HttpFetchResult.ResultOk ok) {
                 if (ok.statusCode() == 304) {
-                    return new HttpFetchResult.Result304Raw();
+                    result = new HttpFetchResult.Result304Raw();
                 }
             }

             switch (result) {
-                case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {}", ok.statusCode(), url);
+                case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {} ({} ms)", ok.statusCode(), url, fetchDuration.toMillis());
                 case HttpFetchResult.ResultRedirect redirect -> logger.info(crawlerAuditMarker, "Fetch result redirect: {} for {}", redirect.url(), url);
-                case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none for {}", url);
-                case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception: {} for {}", ex.getClass().getSimpleName(), url);
+                case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none for {}", url);
+                case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception for {}", url, ex.ex());
                 case HttpFetchResult.Result304Raw raw -> logger.info(crawlerAuditMarker, "Fetch result: 304 Raw for {}", url);
                 case HttpFetchResult.Result304ReplacedWithReference ref -> logger.info(crawlerAuditMarker, "Fetch result: 304 With reference for {}", url);
             }

             return result;
         }
     }
     catch (Exception ex) {
-        ex.printStackTrace();
+        logger.error(crawlerAuditMarker, "Fetch result exception for {}", url, ex);

         return new HttpFetchResult.ResultException(ex);
     }
@@ -493,56 +529,61 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
     }

-    private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
-        ClassicHttpRequest getRequest = ClassicRequestBuilder.get(sitemapUrl.asURI())
-                .addHeader("User-Agent", userAgentString)
-                .addHeader("Accept-Encoding", "gzip")
-                .addHeader("Accept", "text/*, */*;q=0.9")
-                .addHeader("User-Agent", userAgentString)
-                .build();
+    private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException {
+        HttpGet getRequest = new HttpGet(sitemapUrl.asURI());
+
+        getRequest.addHeader("User-Agent", userAgentString);
+        getRequest.addHeader("Accept-Encoding", "gzip");
+        getRequest.addHeader("Accept", "text/*, */*;q=0.9");
+        getRequest.addHeader("User-Agent", userAgentString);

         try (var sl = new SendLock()) {
             return client.execute(getRequest, response -> {
-                if (response.getCode() != 200) {
-                    return new SitemapResult.SitemapError();
-                }
-
-                Document parsedSitemap = Jsoup.parse(
-                        EntityUtils.toString(response.getEntity()),
-                        sitemapUrl.toString(),
-                        Parser.xmlParser()
-                );
-
-                if (parsedSitemap.childrenSize() == 0) {
-                    return new SitemapResult.SitemapError();
-                }
-
-                String rootTagName = parsedSitemap.child(0).tagName();
-
-                return switch (rootTagName.toLowerCase()) {
-                    case "sitemapindex" -> {
-                        List<String> references = new ArrayList<>();
-                        for (var locTag : parsedSitemap.getElementsByTag("loc")) {
-                            references.add(locTag.text().trim());
-                        }
-                        yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
-                    }
-                    case "urlset" -> {
-                        List<String> urls = new ArrayList<>();
-                        for (var locTag : parsedSitemap.select("url > loc")) {
-                            urls.add(locTag.text().trim());
-                        }
-                        yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
-                    }
-                    case "rss", "atom" -> {
-                        List<String> urls = new ArrayList<>();
-                        for (var locTag : parsedSitemap.select("link, url")) {
-                            urls.add(locTag.text().trim());
-                        }
-                        yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
-                    }
-                    default -> new SitemapResult.SitemapError();
-                };
+                try {
+                    if (response.getCode() != 200) {
+                        return new SitemapResult.SitemapError();
+                    }
+
+                    Document parsedSitemap = Jsoup.parse(
+                            EntityUtils.toString(response.getEntity()),
+                            sitemapUrl.toString(),
+                            Parser.xmlParser()
+                    );
+
+                    if (parsedSitemap.childrenSize() == 0) {
+                        return new SitemapResult.SitemapError();
+                    }
+
+                    String rootTagName = parsedSitemap.child(0).tagName();
+
+                    return switch (rootTagName.toLowerCase()) {
+                        case "sitemapindex" -> {
+                            List<String> references = new ArrayList<>();
+                            for (var locTag : parsedSitemap.getElementsByTag("loc")) {
+                                references.add(locTag.text().trim());
+                            }
+                            yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
+                        }
+                        case "urlset" -> {
+                            List<String> urls = new ArrayList<>();
+                            for (var locTag : parsedSitemap.select("url > loc")) {
+                                urls.add(locTag.text().trim());
+                            }
+                            yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
+                        }
+                        case "rss", "atom" -> {
+                            List<String> urls = new ArrayList<>();
+                            for (var locTag : parsedSitemap.select("link, url")) {
+                                urls.add(locTag.text().trim());
+                            }
+                            yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
+                        }
+                        default -> new SitemapResult.SitemapError();
+                    };
+                }
+                finally {
+                    EntityUtils.consume(response.getEntity());
+                }
             });
         }
         catch (Exception ex) {
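For illustration, the root-tag dispatch above can be exercised standalone with Jsoup's XML parser. A sketch with an invented two-entry sitemap index; only Jsoup is required.

```java
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;

public class SitemapParseExample {
    public static void main(String[] args) {
        String xml = """
                <sitemapindex>
                  <sitemap><loc>https://example.com/sitemap-a.xml</loc></sitemap>
                  <sitemap><loc>https://example.com/sitemap-b.xml</loc></sitemap>
                </sitemapindex>
                """;

        Document doc = Jsoup.parse(xml, "https://example.com/sitemap.xml", Parser.xmlParser());

        String rootTagName = doc.child(0).tagName();
        switch (rootTagName.toLowerCase()) {
            case "sitemapindex" -> {
                // A sitemap index points at further sitemaps rather than pages
                for (var locTag : doc.getElementsByTag("loc")) {
                    System.out.println("reference: " + locTag.text().trim());
                }
            }
            case "urlset" -> {
                for (var locTag : doc.select("url > loc")) {
                    System.out.println("url: " + locTag.text().trim());
                }
            }
            default -> System.out.println("unrecognized sitemap root: " + rootTagName);
        }
    }
}
```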
@@ -573,13 +614,12 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
     private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
         try (var sl = new SendLock()) {

-            ClassicHttpRequest request = ClassicRequestBuilder.get(url.asURI())
-                    .addHeader("User-Agent", userAgentString)
-                    .addHeader("Accept-Encoding", "gzip")
-                    .addHeader("Accept", "text/*, */*;q=0.9")
-                    .build();
+            HttpGet request = new HttpGet(url.asURI());
+            request.addHeader("User-Agent", userAgentString);
+            request.addHeader("Accept-Encoding", "gzip");
+            request.addHeader("Accept", "text/*, */*;q=0.9");

-            HttpFetchResult result = recorder.fetch(client, request);
+            HttpFetchResult result = recorder.fetch(client, new DomainCookies(), request);

             return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) ->
                     robotsParser.parseContent(url.toString(),
@@ -595,18 +635,21 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {

     @Override
     public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
-        if (exception instanceof SocketTimeoutException ex) {
+        if (exception instanceof SocketTimeoutException) { // Timeouts are not recoverable
             return false;
         }
+        if (exception instanceof SSLException) { // SSL exceptions are unlikely to be recoverable
+            return false;
+        }

-        return executionCount < 3;
+        return executionCount <= 3;
     }

     @Override
     public boolean retryRequest(HttpResponse response, int executionCount, HttpContext context) {
         return switch (response.getCode()) {
-            case 500, 503 -> executionCount < 2;
-            case 429 -> executionCount < 3;
+            case 500, 503 -> executionCount <= 2;
+            case 429 -> executionCount <= 3;
             default -> false;
         };
     }
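The `<` to `<=` change shifts the retry budget by one. In HttpClient 5 the executionCount handed to retryRequest is the number of unsuccessful attempts made so far, so `executionCount <= 3` permits three retries (four attempts in total) where `< 3` stopped after two. A toy trace, assuming those semantics:

```java
// Quick check of the <= boundary change above (simplified, not HttpClient itself).
public class RetryBoundarySketch {
    static boolean retryRequest(int executionCount) {
        return executionCount <= 3;
    }

    public static void main(String[] args) {
        int attempts = 0;

        while (true) {
            attempts++;                          // simulate a request that always fails
            if (!retryRequest(attempts)) break;  // consulted after each failure
        }

        System.out.println("attempts = " + attempts); // prints 4
    }
}
```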
@@ -2,6 +2,7 @@ package nu.marginalia.crawl.fetcher.warc;

 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.BOMInputStream;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.core5.http.ClassicHttpResponse;
 import org.apache.hc.core5.http.Header;
 import org.netpreserve.jwarc.WarcTruncationReason;

@@ -43,7 +44,9 @@ public abstract class WarcInputBuffer implements AutoCloseable {
      * and suppressed from the headers.
      * If an error occurs, a buffer will be created with no content and an error status.
      */
-    static WarcInputBuffer forResponse(ClassicHttpResponse response, Duration timeLimit) throws IOException {
+    static WarcInputBuffer forResponse(ClassicHttpResponse response,
+                                       HttpGet request,
+                                       Duration timeLimit) throws IOException {
         if (response == null)
             return new ErrorBuffer();

@@ -54,16 +57,29 @@ public abstract class WarcInputBuffer implements AutoCloseable {
             return new ErrorBuffer();
         }

-        InputStream is = entity.getContent();
-        long length = entity.getContentLength();
+        InputStream is = null;
+        try {
+            is = entity.getContent();
+            long length = entity.getContentLength();

-        try (response) {
             if (length > 0 && length < 8192) {
                 // If the content is small and not compressed, we can just read it into memory
-                return new MemoryBuffer(response.getHeaders(), timeLimit, is, (int) length);
+                return new MemoryBuffer(response.getHeaders(), request, timeLimit, is, (int) length);
             } else {
                 // Otherwise, we unpack it into a file and read it from there
-                return new FileBuffer(response.getHeaders(), timeLimit, is);
+                return new FileBuffer(response.getHeaders(), request, timeLimit, is);
             }
         }
+        finally {
+            try {
+                is.skip(Long.MAX_VALUE);
+            }
+            catch (IOException e) {
+                // Ignore the exception
+            }
+            finally {
+                // Close the input stream
+                IOUtils.closeQuietly(is);
+            }
+        }
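The new finally block drains the entity stream (is.skip) before closing it. Draining to end-of-stream is generally what lets a keep-alive connection return to the pool, while closing an undrained stream tends to cost the connection itself. The idiom in isolation, against a plain InputStream:

```java
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

// The drain-then-close idiom from the finally block above, standalone.
public class DrainBeforeCloseSketch {
    public static void main(String[] args) {
        InputStream is = new ByteArrayInputStream(new byte[8192]);
        try {
            // ... read as much of the body as we actually want ...
        } finally {
            try {
                is.skip(Long.MAX_VALUE); // consume the remainder
            } catch (IOException e) {
                // ignore: we are closing anyway
            } finally {
                try { is.close(); } catch (IOException e) { /* ignore */ }
            }
        }
    }
}
```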
@@ -71,7 +87,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
     }

     /** Copy an input stream to an output stream, with a maximum size and time limit */
-    protected void copy(InputStream is, OutputStream os, Duration timeLimit) {
+    protected void copy(InputStream is, HttpGet request, OutputStream os, Duration timeLimit) {
         Instant start = Instant.now();
         Instant timeout = start.plus(timeLimit);
         long size = 0;

@@ -86,6 +102,11 @@ public abstract class WarcInputBuffer implements AutoCloseable {
             Duration remaining = Duration.between(Instant.now(), timeout);
             if (remaining.isNegative()) {
                 truncationReason = WarcTruncationReason.TIME;
+                // Abort the request if the time limit is exceeded
+                // so we don't keep the connection open forever or are forced to consume
+                // the stream to the end
+
+                request.abort();
                 break;
             }
@@ -104,6 +125,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
             }
             else if (truncationReason != WarcTruncationReason.LENGTH) {
                 truncationReason = WarcTruncationReason.LENGTH;
                 break;
             }

         } catch (IOException e) {
@@ -111,13 +133,6 @@ public abstract class WarcInputBuffer implements AutoCloseable {
             }
         }

-        // Try to close the connection as long as we haven't timed out.
-        // As per Apache HttpClient's semantics, this will reset the connection
-        // and close the stream if we have timed out.
-
-        if (truncationReason != WarcTruncationReason.TIME) {
-            IOUtils.closeQuietly(is);
-        }
     }

     /** Takes a Content-Range header and checks if it is complete.

@@ -218,7 +233,7 @@ class ErrorBuffer extends WarcInputBuffer {
 /** Buffer for when we have the response in memory */
 class MemoryBuffer extends WarcInputBuffer {
     byte[] data;
-    public MemoryBuffer(Header[] headers, Duration timeLimit, InputStream responseStream, int size) {
+    public MemoryBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream, int size) {
         super(suppressContentEncoding(headers));

         if (!isRangeComplete(headers)) {

@@ -229,7 +244,7 @@ class MemoryBuffer extends WarcInputBuffer {

         var outputStream = new ByteArrayOutputStream(size);

-        copy(responseStream, outputStream, timeLimit);
+        copy(responseStream, request, outputStream, timeLimit);

         data = outputStream.toByteArray();
     }

@@ -253,7 +268,7 @@ class MemoryBuffer extends WarcInputBuffer {
 class FileBuffer extends WarcInputBuffer {
     private final Path tempFile;

-    public FileBuffer(Header[] headers, Duration timeLimit, InputStream responseStream) throws IOException {
+    public FileBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream) throws IOException {
         super(suppressContentEncoding(headers));

         if (!isRangeComplete(headers)) {

@@ -265,7 +280,7 @@ class FileBuffer extends WarcInputBuffer {
         this.tempFile = Files.createTempFile("rsp", ".html");

         try (var out = Files.newOutputStream(tempFile)) {
-            copy(responseStream, out, timeLimit);
+            copy(responseStream, request, out, timeLimit);
         }
         catch (Exception ex) {
             truncationReason = WarcTruncationReason.UNSPECIFIED;
@@ -1,6 +1,7 @@
 package nu.marginalia.crawl.fetcher.warc;

 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.link_parser.LinkParser;

@@ -8,9 +9,7 @@ import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
 import org.apache.hc.client5.http.classic.HttpClient;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
-import org.apache.hc.client5.http.cookie.CookieStore;
-import org.apache.hc.core5.http.ClassicHttpRequest;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.core5.http.NameValuePair;
 import org.jetbrains.annotations.Nullable;
 import org.netpreserve.jwarc.*;

@@ -53,23 +52,15 @@ public class WarcRecorder implements AutoCloseable {
     // Affix a version string in case we need to change the format in the future
     // in some way
     private final String warcRecorderVersion = "1.0";
-    private final CookieStore cookies;
     private final LinkParser linkParser = new LinkParser();
     /**
      * Create a new WarcRecorder that will write to the given file
      *
      * @param warcFile The file to write to
      */
-    public WarcRecorder(Path warcFile, HttpFetcherImpl fetcher) throws IOException {
+    public WarcRecorder(Path warcFile) throws IOException {
         this.warcFile = warcFile;
         this.writer = new WarcWriter(warcFile);
-        this.cookies = fetcher.getCookies();
-    }
-
-    public WarcRecorder(Path warcFile, CookieStore cookies) throws IOException {
-        this.warcFile = warcFile;
-        this.writer = new WarcWriter(warcFile);
-        this.cookies = cookies;
     }

     /**

@@ -79,24 +70,21 @@ public class WarcRecorder implements AutoCloseable {
     public WarcRecorder() throws IOException {
         this.warcFile = Files.createTempFile("warc", ".warc.gz");
         this.writer = new WarcWriter(this.warcFile);
-        this.cookies = new BasicCookieStore();

         temporaryFile = true;
     }

-    private boolean hasCookies() {
-        return !cookies.getCookies().isEmpty();
-    }
-
     public HttpFetchResult fetch(HttpClient client,
-                                 ClassicHttpRequest request)
+                                 DomainCookies cookies,
+                                 HttpGet request)
             throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
     {
-        return fetch(client, request, Duration.ofMillis(MAX_TIME));
+        return fetch(client, cookies, request, Duration.ofMillis(MAX_TIME));
     }

     public HttpFetchResult fetch(HttpClient client,
-                                 ClassicHttpRequest request,
+                                 DomainCookies cookies,
+                                 HttpGet request,
                                  Duration timeout)
             throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
     {

@@ -113,13 +101,15 @@ public class WarcRecorder implements AutoCloseable {
         // Inject a range header to attempt to limit the size of the response
         // to the maximum size we want to store, if the server supports it.
         request.addHeader("Range", "bytes=0-"+MAX_SIZE);

+        cookies.paintRequest(request);
+
         try {
-            return client.execute(request, response -> {
+            return client.execute(request,response -> {

-                try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, timeout);
+                try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request, timeout);
                      InputStream inputStream = inputBuffer.read()) {

+                    cookies.updateCookieStore(response);
+
                     // Build and write the request

                     WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder();

@@ -143,8 +133,9 @@ public class WarcRecorder implements AutoCloseable {
                     warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
                     writer.write(warcRequest);

-                    if (hasCookies()) {
-                        extraHeaders.put("X-Has-Cookies", List.of("1"));
+
+                    if (cookies.hasCookies()) {
+                        response.addHeader("X-Has-Cookies", 1);
                     }

                     byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);

@@ -259,7 +250,7 @@ public class WarcRecorder implements AutoCloseable {
         writer.write(item);
     }

-    private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags contentTags) {
+    private void saveOldResponse(EdgeUrl url, DomainCookies domainCookies, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags contentTags) {
         try {
             WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
             WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();

@@ -320,7 +311,7 @@ public class WarcRecorder implements AutoCloseable {
                     .date(Instant.now())
                     .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());

-            if (hasCookies()) {
+            if (domainCookies.hasCookies() || (headers != null && headers.contains("Set-Cookie:"))) {
                 builder.addHeader("X-Has-Cookies", "1");
             }

@@ -340,8 +331,8 @@ public class WarcRecorder implements AutoCloseable {
      * an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this
      * scenario we want to record the data as it was in the previous crawl, but not re-fetch it.
      */
-    public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags ctags) {
-        saveOldResponse(url, contentType, statusCode, documentBody, headers, ctags);
+    public void writeReferenceCopy(EdgeUrl url, DomainCookies cookies, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags ctags) {
+        saveOldResponse(url, cookies, contentType, statusCode, documentBody, headers, ctags);
     }

     public void writeWarcinfoHeader(String ip, EdgeDomain domain, HttpFetcherImpl.DomainProbeResult result) throws IOException {
@@ -3,6 +3,7 @@ package nu.marginalia.crawl.logic;
 import nu.marginalia.model.EdgeDomain;

 import java.util.Map;
+import java.util.Optional;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.Semaphore;

@@ -19,8 +20,22 @@ public class DomainLocks {
      * and may be held by another thread. The caller is responsible for locking and releasing the lock.
      */
     public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
-        return new DomainLock(domain.toString(),
-                locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits));
+        var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
+
+        sem.acquire();
+
+        return new DomainLock(sem);
+    }
+
+    public Optional<DomainLock> tryLockDomain(EdgeDomain domain) {
+        var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
+        if (sem.tryAcquire(1)) {
+            return Optional.of(new DomainLock(sem));
+        }
+        else {
+            // We don't have a lock, so we return an empty optional
+            return Optional.empty();
+        }
     }

     private Semaphore defaultPermits(String topDomain) {

@@ -44,7 +59,11 @@ public class DomainLocks {
         return new Semaphore(2);
     }

-    public boolean canLock(EdgeDomain domain) {
+    /** Returns true if the domain is lockable, i.e. if it is not already locked by another thread.
+     * (this is just a hint, and does not guarantee that the domain is actually lockable any time
+     * after this method returns true)
+     */
+    public boolean isLockableHint(EdgeDomain domain) {
         Semaphore sem = locks.get(domain.topDomain.toLowerCase());
         if (null == sem)
             return true;

@@ -53,22 +72,16 @@ public class DomainLocks {
     }

     public static class DomainLock implements AutoCloseable {
-        private final String domainName;
         private final Semaphore semaphore;

-        DomainLock(String domainName, Semaphore semaphore) throws InterruptedException {
-            this.domainName = domainName;
+        DomainLock(Semaphore semaphore) {
             this.semaphore = semaphore;
-
-            Thread.currentThread().setName("crawling:" + domainName + " [await domain lock]");
-            semaphore.acquire();
-            Thread.currentThread().setName("crawling:" + domainName);
         }

         @Override
         public void close() throws Exception {
             semaphore.release();
-            Thread.currentThread().setName("crawling:" + domainName + " [wrapping up]");
+            Thread.currentThread().setName("[idle]");
         }
     }
 }
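For illustration, tryLockDomain pairs a non-blocking Semaphore.tryAcquire with an AutoCloseable wrapper, so callers can use try-with-resources on the happy path and requeue on contention. A self-contained sketch of the pattern (Lease and the permit count are invented):

```java
import java.util.Optional;
import java.util.concurrent.Semaphore;

public class TryLockSketch {
    static class Lease implements AutoCloseable {
        private final Semaphore semaphore;
        Lease(Semaphore semaphore) { this.semaphore = semaphore; }
        @Override public void close() { semaphore.release(); }
    }

    private final Semaphore permits = new Semaphore(2);

    Optional<Lease> tryLease() {
        // tryAcquire never blocks, so a worker that cannot get a permit
        // can requeue its task instead of stalling the whole pool
        return permits.tryAcquire() ? Optional.of(new Lease(permits)) : Optional.empty();
    }

    void work() {
        Optional<Lease> lease = tryLease();
        if (lease.isEmpty()) {
            return; // caller requeues, mirroring CrawlerMain's retryQueue.put(this)
        }
        try (Lease l = lease.get()) {
            // ... critical section ...
        }
    }
}
```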
@@ -6,6 +6,7 @@ import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.DomainStateDb;
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.logic.LinkFilterSelector;

@@ -51,9 +52,10 @@ public class CrawlerRetreiver implements AutoCloseable {
     private final DomainStateDb domainStateDb;
     private final WarcRecorder warcRecorder;
     private final CrawlerRevisitor crawlerRevisitor;
+    private final DomainCookies cookies = new DomainCookies();

     private static final CrawlerConnectionThrottle connectionThrottle = new CrawlerConnectionThrottle(
-            Duration.ofSeconds(1) // pace the connections to avoid network congestion by waiting 1 second between establishing them
+            Duration.ofSeconds(1) // pace the connections to avoid network congestion at startup
     );

     int errorCount = 0;

@@ -124,7 +126,7 @@ public class CrawlerRetreiver implements AutoCloseable {
         }

         Instant recrawlStart = Instant.now();
-        CrawlerRevisitor.RecrawlMetadata recrawlMetadata = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
+        CrawlerRevisitor.RecrawlMetadata recrawlMetadata = crawlerRevisitor.recrawl(oldCrawlData, cookies, robotsRules, delayTimer);
         Duration recrawlTime = Duration.between(recrawlStart, Instant.now());

         // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified

@@ -274,7 +276,7 @@ public class CrawlerRetreiver implements AutoCloseable {
         try {
             var url = rootUrl.withPathAndParam("/", null);

-            HttpFetchResult result = fetcher.fetchContent(url, warcRecorder, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+            HttpFetchResult result = fetcher.fetchContent(url, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
             timer.waitFetchDelay(0);

             if (result instanceof HttpFetchResult.ResultRedirect(EdgeUrl location)) {

@@ -337,7 +339,7 @@ public class CrawlerRetreiver implements AutoCloseable {

         // Grab the favicon if it exists

-        if (fetcher.fetchContent(faviconUrl, warcRecorder, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED) instanceof HttpFetchResult.ResultOk iconResult) {
+        if (fetcher.fetchContent(faviconUrl, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED) instanceof HttpFetchResult.ResultOk iconResult) {
             String contentType = iconResult.header("Content-Type");
             byte[] iconData = iconResult.getBodyBytes();

@@ -407,7 +409,7 @@ public class CrawlerRetreiver implements AutoCloseable {
         if (parsedOpt.isEmpty())
             return false;

-        HttpFetchResult result = fetcher.fetchContent(parsedOpt.get(), warcRecorder, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+        HttpFetchResult result = fetcher.fetchContent(parsedOpt.get(), warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
         timer.waitFetchDelay(0);

         if (!(result instanceof HttpFetchResult.ResultOk ok)) {

@@ -435,7 +437,7 @@ public class CrawlerRetreiver implements AutoCloseable {
         {
             var contentTags = reference.getContentTags();

-            HttpFetchResult fetchedDoc = fetcher.fetchContent(top, warcRecorder, timer, contentTags, HttpFetcher.ProbeType.FULL);
+            HttpFetchResult fetchedDoc = fetcher.fetchContent(top, warcRecorder, cookies, timer, contentTags, HttpFetcher.ProbeType.FULL);
             timer.waitFetchDelay();

             if (Thread.interrupted()) {

@@ -461,7 +463,7 @@ public class CrawlerRetreiver implements AutoCloseable {
         {
             var doc = reference.doc();

-            warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBodyBytes, doc.headers, contentTags);
+            warcRecorder.writeReferenceCopy(top, cookies, doc.contentType, doc.httpStatus, doc.documentBodyBytes, doc.headers, contentTags);

             fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
                     new ContentType(doc.contentType, "UTF-8"),
@@ -2,6 +2,7 @@ package nu.marginalia.crawl.retreival.revisit;

 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
 import nu.marginalia.crawl.retreival.CrawlDelayTimer;

@@ -37,6 +38,7 @@ public class CrawlerRevisitor {

     /** Performs a re-crawl of old documents, comparing etags and last-modified */
     public RecrawlMetadata recrawl(CrawlDataReference oldCrawlData,
+                                   DomainCookies cookies,
                                    SimpleRobotRules robotsRules,
                                    CrawlDelayTimer delayTimer)
             throws InterruptedException {

@@ -132,6 +134,7 @@ public class CrawlerRevisitor {
             }
             // Add a WARC record so we don't repeat this
             warcRecorder.writeReferenceCopy(url,
+                    cookies,
                     doc.contentType,
                     doc.httpStatus,
                     doc.documentBodyBytes,
@@ -11,6 +11,8 @@ import org.junit.jupiter.api.*;
 import java.io.IOException;
 import java.net.URISyntaxException;

+import static org.junit.jupiter.api.Assertions.assertEquals;
+
 @Tag("slow")
 class HttpFetcherImplContentTypeProbeTest {

@@ -85,55 +87,59 @@ class HttpFetcherImplContentTypeProbeTest {

     @AfterEach
     public void tearDown() throws IOException {
+        var stats = fetcher.getPoolStats();
+        assertEquals(0, stats.getLeased());
+        assertEquals(0, stats.getPending());
+
         fetcher.close();
     }

     @Test
     public void testProbeContentTypeHtmlShortcircuitPath() throws URISyntaxException {
-        var result = fetcher.probeContentType(new EdgeUrl("https://localhost/test.html"), new CrawlDelayTimer(50), ContentTags.empty());
-        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Ok.class, result);
+        var result = fetcher.probeContentType(new EdgeUrl("https://localhost/test.html"), new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
+        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
     }

     @Test
     public void testProbeContentTypeHtmlShortcircuitTags() {
-        var result = fetcher.probeContentType(contentTypeBinaryUrl, new CrawlDelayTimer(50), new ContentTags("a", "b"));
-        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Ok.class, result);
+        var result = fetcher.probeContentType(contentTypeBinaryUrl, new DomainCookies(), new CrawlDelayTimer(50), new ContentTags("a", "b"));
+        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
     }

     @Test
     public void testProbeContentTypeHtml() {
-        var result = fetcher.probeContentType(contentTypeHtmlUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        var result = fetcher.probeContentType(contentTypeHtmlUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
         Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Ok(contentTypeHtmlUrl), result);
     }

     @Test
     public void testProbeContentTypeBinary() {
-        var result = fetcher.probeContentType(contentTypeBinaryUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        var result = fetcher.probeContentType(contentTypeBinaryUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
         Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.BadContentType("application/octet-stream", 200), result);
     }

     @Test
     public void testProbeContentTypeRedirect() {
-        var result = fetcher.probeContentType(redirectUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        var result = fetcher.probeContentType(redirectUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
         Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Redirect(contentTypeHtmlUrl), result);
     }

     @Test
     public void testProbeContentTypeBadHttpStatus() {
-        var result = fetcher.probeContentType(badHttpStatusUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        var result = fetcher.probeContentType(badHttpStatusUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
         Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.HttpError(500, "Bad status code"), result);
     }

     @Test
     public void testOnlyGetAllowed() {
-        var result = fetcher.probeContentType(onlyGetAllowedUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        var result = fetcher.probeContentType(onlyGetAllowedUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
         Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Ok(onlyGetAllowedUrl), result);
     }

     @Test
     public void testTimeout() {
-        var result = fetcher.probeContentType(timeoutUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        var result = fetcher.probeContentType(timeoutUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
         Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Timeout.class, result);
     }
@@ -12,6 +12,8 @@ import org.junit.jupiter.api.*;
 import java.io.IOException;
 import java.net.URISyntaxException;

+import static org.junit.jupiter.api.Assertions.assertEquals;
+
 @Tag("slow")
 class HttpFetcherImplDomainProbeTest {

@@ -47,6 +49,10 @@ class HttpFetcherImplDomainProbeTest {

     @AfterEach
     public void tearDown() throws IOException {
+        var stats = fetcher.getPoolStats();
+        assertEquals(0, stats.getLeased());
+        assertEquals(0, stats.getPending());
+
         fetcher.close();
     }
@@ -31,6 +31,7 @@ class HttpFetcherImplFetchTest {
     private static String lastModified = "Wed, 21 Oct 2024 07:28:00 GMT";
 
     private static EdgeUrl okUrl;
+    private static EdgeUrl okUrlSetsCookie;
     private static EdgeUrl okRangeResponseUrl;
     private static EdgeUrl okUrlWith304;
 
@@ -88,6 +89,19 @@ class HttpFetcherImplFetchTest {
                         .withStatus(200)
                         .withBody("Hello World")));
 
+        okUrlSetsCookie = new EdgeUrl("http://localhost:18089/okSetCookie.bin");
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(okUrlSetsCookie.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withHeader("Set-Cookie", "test=1")
+                        .withStatus(200)));
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(okUrlSetsCookie.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withHeader("Set-Cookie", "test=1")
+                        .withStatus(200)
+                        .withBody("Hello World")));
+
         okUrlWith304 = new EdgeUrl("http://localhost:18089/ok304.bin");
         wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(okUrlWith304.path))
                 .willReturn(WireMock.aResponse()
@@ -117,6 +131,8 @@ class HttpFetcherImplFetchTest {
                 .withHeader("Keep-Alive", "max=4, timeout=30")
                 .withBody("Hello")
         ));
 
+
         wireMockServer.start();
+
     }
@@ -134,20 +150,31 @@ class HttpFetcherImplFetchTest {
     public void setUp() throws IOException {
         fetcher = new HttpFetcherImpl(new UserAgent("test.marginalia.nu", "test.marginalia.nu"));
         warcFile = Files.createTempFile(getClass().getSimpleName(), ".warc");
-        warcRecorder = new WarcRecorder(warcFile, fetcher);
+        warcRecorder = new WarcRecorder(warcFile);
     }
 
     @AfterEach
     public void tearDown() throws IOException {
+        var stats = fetcher.getPoolStats();
+        assertEquals(0, stats.getLeased());
+        assertEquals(0, stats.getPending());
+
+        System.out.println(stats);
+
         fetcher.close();
         warcRecorder.close();
         Files.deleteIfExists(warcFile);
     }
 
 
+    @Test
+    public void testFoo() {
+        fetcher.fetchSitemapUrls("https://www.marginalia.nu/sitemap.xml", new CrawlDelayTimer(100));
+    }
+
     @Test
     public void testOk_NoProbe() throws IOException {
-        var result = fetcher.fetchContent(okUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+        var result = fetcher.fetchContent(okUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
 
         Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
         Assertions.assertTrue(result.isOk());
@@ -158,12 +185,29 @@ class HttpFetcherImplFetchTest {
         Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));
 
         WarcResponse response = (WarcResponse) warcRecords.get(1);
-        assertEquals("0", response.headers().first("X-Has-Cookies").orElse("0"));
+        assertEquals("0", response.http().headers().first("X-Has-Cookies").orElse("0"));
     }
 
+    @Test
+    public void testOkSetsCookie() throws IOException {
+        var cookies = new DomainCookies();
+        var result = fetcher.fetchContent(okUrlSetsCookie, warcRecorder, cookies, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
+        Assertions.assertTrue(result.isOk());
+
+        List<WarcRecord> warcRecords = getWarcRecords();
+        assertEquals(2, warcRecords.size());
+        Assertions.assertInstanceOf(WarcRequest.class, warcRecords.get(0));
+        Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));
+
+        WarcResponse response = (WarcResponse) warcRecords.get(1);
+        assertEquals("1", response.http().headers().first("X-Has-Cookies").orElse("0"));
+    }
+
     @Test
     public void testOk_FullProbe() {
-        var result = fetcher.fetchContent(okUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+        var result = fetcher.fetchContent(okUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
 
         Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
         Assertions.assertTrue(result.isOk());
@@ -171,7 +215,7 @@ class HttpFetcherImplFetchTest {
 
     @Test
     public void testOk304_NoProbe() {
-        var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.DISABLED);
+        var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.DISABLED);
 
         Assertions.assertInstanceOf(HttpFetchResult.Result304Raw.class, result);
         System.out.println(result);
@@ -180,7 +224,7 @@ class HttpFetcherImplFetchTest {
 
     @Test
     public void testOk304_FullProbe() {
-        var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.FULL);
+        var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.FULL);
 
         Assertions.assertInstanceOf(HttpFetchResult.Result304Raw.class, result);
         System.out.println(result);
@@ -188,7 +232,7 @@ class HttpFetcherImplFetchTest {
 
     @Test
     public void testBadStatus_NoProbe() throws IOException {
-        var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+        var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
 
         Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
         Assertions.assertFalse(result.isOk());
@@ -202,7 +246,7 @@ class HttpFetcherImplFetchTest {
 
     @Test
     public void testBadStatus_FullProbe() {
-        var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+        var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
 
         Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
         Assertions.assertFalse(result.isOk());
@@ -212,7 +256,7 @@ class HttpFetcherImplFetchTest {
 
     @Test
     public void testRedirect_NoProbe() throws URISyntaxException, IOException {
-        var result = fetcher.fetchContent(redirectUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+        var result = fetcher.fetchContent(redirectUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
 
         Assertions.assertInstanceOf(HttpFetchResult.ResultRedirect.class, result);
         assertEquals(new EdgeUrl("http://localhost:18089/test.html.bin"), ((HttpFetchResult.ResultRedirect) result).url());
@@ -225,7 +269,7 @@ class HttpFetcherImplFetchTest {
 
     @Test
     public void testRedirect_FullProbe() throws URISyntaxException {
-        var result = fetcher.fetchContent(redirectUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+        var result = fetcher.fetchContent(redirectUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
 
         Assertions.assertInstanceOf(HttpFetchResult.ResultRedirect.class, result);
         assertEquals(new EdgeUrl("http://localhost:18089/test.html.bin"), ((HttpFetchResult.ResultRedirect) result).url());
@@ -238,7 +282,7 @@ class HttpFetcherImplFetchTest {
     public void testFetchTimeout_NoProbe() throws IOException, URISyntaxException {
         Instant requestStart = Instant.now();
 
-        var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+        var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
 
         Assertions.assertInstanceOf(HttpFetchResult.ResultException.class, result);
 
@@ -262,7 +306,7 @@ class HttpFetcherImplFetchTest {
 
     @Test
     public void testRangeResponse() throws IOException {
-        var result = fetcher.fetchContent(okRangeResponseUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+        var result = fetcher.fetchContent(okRangeResponseUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
 
         Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
         Assertions.assertTrue(result.isOk());
@@ -279,7 +323,7 @@ class HttpFetcherImplFetchTest {
     @Test
     public void testFetchTimeout_Probe() throws IOException, URISyntaxException {
         Instant requestStart = Instant.now();
-        var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+        var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
         Instant requestEnd = Instant.now();
 
         Assertions.assertInstanceOf(HttpFetchResult.ResultException.class, result);
@@ -302,7 +346,7 @@ class HttpFetcherImplFetchTest {
     @Test
     public void testKeepaliveUrl() {
         // mostly for smoke testing and debugger utility
-        var result = fetcher.fetchContent(keepAliveUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+        var result = fetcher.fetchContent(keepAliveUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
 
         Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
         Assertions.assertTrue(result.isOk());
@@ -319,6 +363,13 @@ class HttpFetcherImplFetchTest {
             WarcXEntityRefused.register(reader);
 
             for (var record : reader) {
+                // Load the body, we need to do this before we close the reader to have access to the content.
+                if (record instanceof WarcRequest req) {
+                    req.http();
+                } else if (record instanceof WarcResponse rsp) {
+                    rsp.http();
+                }
+
                 records.add(record);
             }
         }
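
One detail in the last hunk deserves a note: jwarc parses records lazily off the underlying channel, so the HTTP payloads must be materialized inside the try-with-resources block, before the WarcReader closes. A condensed sketch of the pattern (WarcXEntityRefused is the crawler's own record type; its import is not shown in this hunk, so it is omitted here too):

import org.netpreserve.jwarc.WarcReader;
import org.netpreserve.jwarc.WarcRecord;
import org.netpreserve.jwarc.WarcRequest;
import org.netpreserve.jwarc.WarcResponse;

import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;

class WarcReadSketch {
    List<WarcRecord> readAll(Path warcFile) throws IOException {
        List<WarcRecord> records = new ArrayList<>();
        try (var reader = new WarcReader(warcFile)) {
            WarcXEntityRefused.register(reader); // crawler-specific record type, registered as above

            for (var record : reader) {
                // Parse the HTTP message now; after reader.close() the channel is gone.
                if (record instanceof WarcRequest req) {
                    req.http();
                } else if (record instanceof WarcResponse rsp) {
                    rsp.http();
                }
                records.add(record);
            }
        }
        return records;
    }
}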
@@ -1,12 +1,12 @@
 package nu.marginalia.crawl.retreival;
 
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import org.apache.hc.client5.http.classic.HttpClient;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.impl.classic.HttpClients;
-import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -45,7 +45,7 @@ class CrawlerWarcResynchronizerTest {
 
     @Test
     void run() throws IOException, URISyntaxException {
-        try (var oldRecorder = new WarcRecorder(fileName, new BasicCookieStore())) {
+        try (var oldRecorder = new WarcRecorder(fileName)) {
             fetchUrl(oldRecorder, "https://www.marginalia.nu/");
             fetchUrl(oldRecorder, "https://www.marginalia.nu/log/");
             fetchUrl(oldRecorder, "https://www.marginalia.nu/feed/");
@@ -55,7 +55,7 @@ class CrawlerWarcResynchronizerTest {
 
         var crawlFrontier = new DomainCrawlFrontier(new EdgeDomain("www.marginalia.nu"), List.of(), 100);
 
-        try (var newRecorder = new WarcRecorder(outputFile, new BasicCookieStore())) {
+        try (var newRecorder = new WarcRecorder(outputFile)) {
             new CrawlerWarcResynchronizer(crawlFrontier, newRecorder).run(fileName);
         }
 
@@ -78,10 +78,10 @@ class CrawlerWarcResynchronizerTest {
     }
 
     void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        var req = ClassicRequestBuilder.get(new java.net.URI(url))
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .build();
-        recorder.fetch(httpClient, req);
+        HttpGet request = new HttpGet(url);
+        request.addHeader("User-agent", "test.marginalia.nu");
+        request.addHeader("Accept-Encoding", "gzip");
+
+        recorder.fetch(httpClient, new DomainCookies(), request);
     }
 }
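
The fetchUrl change above is the template for the rest of this changeset: requests become plain HttpClient 5 HttpGet objects, and cookie state moves out of WarcRecorder's constructor (the old BasicCookieStore) into an explicit DomainCookies argument on each fetch. The practical effect is that per-domain cookie handling is visible at the call site and a recorder can be built from just an output path. A minimal sketch, assuming the fetch signature shown above:

import nu.marginalia.crawl.fetcher.DomainCookies;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.classic.methods.HttpGet;

class RecorderFetchSketch {
    void recordFetch(WarcRecorder recorder, HttpClient httpClient, String url) throws Exception {
        HttpGet request = new HttpGet(url);
        request.addHeader("User-agent", "test.marginalia.nu");
        request.addHeader("Accept-Encoding", "gzip");

        // Cookies travel with the request, not with the recorder.
        recorder.fetch(httpClient, new DomainCookies(), request);
    }
}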
@@ -2,6 +2,7 @@ package nu.marginalia.crawl.retreival.fetcher;
 
 import com.sun.net.httpserver.HttpServer;
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.retreival.CrawlDelayTimer;
@@ -88,7 +89,7 @@ class ContentTypeProberTest {
 
     @Test
     void probeContentTypeOk() throws Exception {
-        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlEndpoint, new CrawlDelayTimer(50), ContentTags.empty());
+        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
 
         System.out.println(result);
 
@@ -97,7 +98,7 @@ class ContentTypeProberTest {
 
     @Test
     void probeContentTypeRedir() throws Exception {
-        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlRedirEndpoint, new CrawlDelayTimer(50), ContentTags.empty());
+        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlRedirEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
 
         System.out.println(result);
 
@@ -106,7 +107,7 @@ class ContentTypeProberTest {
 
     @Test
     void probeContentTypeBad() throws Exception {
-        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(binaryEndpoint, new CrawlDelayTimer(50), ContentTags.empty());
+        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(binaryEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
 
         System.out.println(result);
 
@@ -115,7 +116,7 @@ class ContentTypeProberTest {
 
     @Test
     void probeContentTypeTimeout() throws Exception {
-        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(timeoutEndpoint, new CrawlDelayTimer(50), ContentTags.empty());
+        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(timeoutEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
 
         System.out.println(result);
@@ -1,11 +1,11 @@
 package nu.marginalia.crawl.retreival.fetcher;
 
 import com.sun.net.httpserver.HttpServer;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import org.apache.hc.client5.http.classic.HttpClient;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.impl.classic.HttpClients;
-import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.junit.jupiter.api.*;
 import org.netpreserve.jwarc.WarcReader;
 import org.netpreserve.jwarc.WarcRequest;
@@ -51,14 +51,14 @@ class WarcRecorderFakeServerTest {
                 os.write("<html><body>hello</body></html>".getBytes());
                 os.flush();
                 try {
-                    TimeUnit.SECONDS.sleep(1);
+                    TimeUnit.SECONDS.sleep(2);
                 } catch (InterruptedException e) {
                     throw new RuntimeException(e);
                 }
                 os.write(":".getBytes());
                 os.flush();
                 try {
-                    TimeUnit.SECONDS.sleep(1);
+                    TimeUnit.SECONDS.sleep(2);
                 } catch (InterruptedException e) {
                     throw new RuntimeException(e);
                 }
@@ -89,24 +89,22 @@ class WarcRecorderFakeServerTest {
         fileNameWarc = Files.createTempFile("test", ".warc");
         fileNameParquet = Files.createTempFile("test", ".parquet");
 
-        client = new WarcRecorder(fileNameWarc, new BasicCookieStore());
+        client = new WarcRecorder(fileNameWarc);
     }
 
     @AfterEach
     public void tearDown() throws Exception {
 
         client.close();
         Files.delete(fileNameWarc);
     }
 
     @Test
     public void fetchFast() throws Exception {
-        client.fetch(httpClient,
-                ClassicRequestBuilder
-                        .get(new java.net.URI("http://localhost:14510/fast"))
-                        .addHeader("User-agent", "test.marginalia.nu")
-                        .addHeader("Accept-Encoding", "gzip")
-                        .build()
-        );
+        HttpGet request = new HttpGet("http://localhost:14510/fast");
+        request.addHeader("User-agent", "test.marginalia.nu");
+        request.addHeader("Accept-Encoding", "gzip");
+
+        client.fetch(httpClient, new DomainCookies(), request);
 
         Map<String, String> sampleData = new HashMap<>();
         try (var warcReader = new WarcReader(fileNameWarc)) {
@@ -127,11 +125,13 @@ class WarcRecorderFakeServerTest {
     public void fetchSlow() throws Exception {
         Instant start = Instant.now();
 
+        HttpGet request = new HttpGet("http://localhost:14510/slow");
+        request.addHeader("User-agent", "test.marginalia.nu");
+        request.addHeader("Accept-Encoding", "gzip");
+
         client.fetch(httpClient,
-                ClassicRequestBuilder.get(new java.net.URI("http://localhost:14510/slow"))
-                        .addHeader("User-agent", "test.marginalia.nu")
-                        .addHeader("Accept-Encoding", "gzip")
-                        .build(),
+                new DomainCookies(),
+                request,
                 Duration.ofSeconds(1)
         );
         Instant end = Instant.now();
@@ -149,6 +149,8 @@ class WarcRecorderFakeServerTest {
             });
         }
 
+        System.out.println(
+                Files.readString(fileNameWarc));
         System.out.println(sampleData);
 
         // Timeout is set to 1 second, but the server will take 5 seconds to respond,
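
Two things change in this file beyond the import swap: the fake server's dribble delay grows from one to two seconds per write, and fetchSlow() exercises a four-argument fetch overload that takes an explicit Duration budget, so truncation is guaranteed rather than timing-dependent. A sketch of the bounded call, assuming the overload exactly as shown in the hunk above:

import nu.marginalia.crawl.fetcher.DomainCookies;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.classic.methods.HttpGet;

import java.time.Duration;

class BoundedFetchSketch {
    void fetchWithBudget(WarcRecorder client, HttpClient httpClient) throws Exception {
        HttpGet request = new HttpGet("http://localhost:14510/slow");
        request.addHeader("User-agent", "test.marginalia.nu");
        request.addHeader("Accept-Encoding", "gzip");

        // The Duration caps the total fetch; the slow endpoint sleeps 2 s between writes,
        // so a 1 s budget reliably cuts the response short.
        client.fetch(httpClient, new DomainCookies(), request, Duration.ofSeconds(1));
    }
}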
@@ -2,14 +2,14 @@ package nu.marginalia.crawl.retreival.fetcher;
 
 import nu.marginalia.UserAgent;
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
 import org.apache.hc.client5.http.classic.HttpClient;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.impl.classic.HttpClients;
-import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -41,7 +41,7 @@ class WarcRecorderTest {
         fileNameWarc = Files.createTempFile("test", ".warc");
         fileNameParquet = Files.createTempFile("test", ".parquet");
 
-        client = new WarcRecorder(fileNameWarc, new BasicCookieStore());
+        client = new WarcRecorder(fileNameWarc);
     }
 
     @AfterEach
@@ -52,12 +52,12 @@ class WarcRecorderTest {
 
     @Test
     void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        client.fetch(httpClient,
-                ClassicRequestBuilder.get(new java.net.URI("https://www.marginalia.nu/"))
-                        .addHeader("User-agent", "test.marginalia.nu")
-                        .addHeader("Accept-Encoding", "gzip")
-                        .build()
-        );
+        HttpGet request = new HttpGet("https://www.marginalia.nu/");
+        request.addHeader("User-agent", "test.marginalia.nu");
+        request.addHeader("Accept-Encoding", "gzip");
+
+        client.fetch(httpClient, new DomainCookies(), request);
+
         Map<String, String> sampleData = new HashMap<>();
         try (var warcReader = new WarcReader(fileNameWarc)) {
@@ -78,8 +78,9 @@ class WarcRecorderTest {
     @Test
     public void flagAsSkipped() throws IOException, URISyntaxException {
 
-        try (var recorder = new WarcRecorder(fileNameWarc, new BasicCookieStore())) {
+        try (var recorder = new WarcRecorder(fileNameWarc)) {
             recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
+                    new DomainCookies(),
                     "text/html",
                     200,
                     "<?doctype html><html><body>test</body></html>".getBytes(),
@@ -102,8 +103,9 @@ class WarcRecorderTest {
     @Test
     public void flagAsSkippedNullBody() throws IOException, URISyntaxException {
 
-        try (var recorder = new WarcRecorder(fileNameWarc, new BasicCookieStore())) {
+        try (var recorder = new WarcRecorder(fileNameWarc)) {
             recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
+                    new DomainCookies(),
                     "text/html",
                     200,
                     null,
@@ -114,8 +116,9 @@ class WarcRecorderTest {
 
     @Test
     public void testSaveImport() throws URISyntaxException, IOException {
-        try (var recorder = new WarcRecorder(fileNameWarc, new BasicCookieStore())) {
+        try (var recorder = new WarcRecorder(fileNameWarc)) {
             recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
+                    new DomainCookies(),
                     "text/html",
                     200,
                     "<?doctype html><html><body>test</body></html>".getBytes(),
@@ -138,23 +141,23 @@ class WarcRecorderTest {
 
     @Test
     public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        client.fetch(httpClient, ClassicRequestBuilder
-                .get(new java.net.URI("https://www.marginalia.nu/"))
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .build());
+        HttpGet request1 = new HttpGet("https://www.marginalia.nu/");
+        request1.addHeader("User-agent", "test.marginalia.nu");
+        request1.addHeader("Accept-Encoding", "gzip");
 
-        client.fetch(httpClient, ClassicRequestBuilder
-                .get(new java.net.URI("https://www.marginalia.nu/log/"))
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .build());
+        client.fetch(httpClient, new DomainCookies(), request1);
 
-        client.fetch(httpClient, ClassicRequestBuilder
-                .get(new java.net.URI("https://www.marginalia.nu/sanic.png"))
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .build());
+        HttpGet request2 = new HttpGet("https://www.marginalia.nu/log/");
+        request2.addHeader("User-agent", "test.marginalia.nu");
+        request2.addHeader("Accept-Encoding", "gzip");
+
+        client.fetch(httpClient, new DomainCookies(), request2);
+
+        HttpGet request3 = new HttpGet("https://www.marginalia.nu/sanic.png");
+        request3.addHeader("User-agent", "test.marginalia.nu");
+        request3.addHeader("Accept-Encoding", "gzip");
+
+        client.fetch(httpClient, new DomainCookies(), request3);
 
         CrawledDocumentParquetRecordFileWriter.convertWarc(
                 "www.marginalia.nu",
@@ -1,6 +1,7 @@
 package nu.marginalia.crawling;
 
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@@ -31,7 +32,7 @@ class HttpFetcherTest {
     void fetchUTF8() throws Exception {
         var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
         try (var recorder = new WarcRecorder()) {
-            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, new DomainCookies(), new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
             if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
                 System.out.println(bodyOk.contentType());
             }
@@ -49,7 +50,7 @@ class HttpFetcherTest {
         var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
 
         try (var recorder = new WarcRecorder()) {
-            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, new DomainCookies(), new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
             if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
                 System.out.println(bodyOk.contentType());
             }
@@ -3,10 +3,7 @@ package nu.marginalia.crawling.retreival;
 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.DomainStateDb;
-import nu.marginalia.crawl.fetcher.ContentTags;
-import nu.marginalia.crawl.fetcher.HttpFetcher;
-import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
-import nu.marginalia.crawl.fetcher.SitemapRetriever;
+import nu.marginalia.crawl.fetcher.*;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.CrawlDelayTimer;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
@@ -137,7 +134,7 @@ public class CrawlerMockFetcherTest {
     }
 
     @Override
-    public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, CrawlDelayTimer timer, ContentTags tags, ProbeType probeType) {
+    public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, DomainCookies cookies, CrawlDelayTimer timer, ContentTags tags, ProbeType probeType) {
         logger.info("Fetching {}", url);
         if (mockData.containsKey(url)) {
             byte[] bodyBytes = mockData.get(url).documentBodyBytes;
@@ -16,7 +16,6 @@ import nu.marginalia.model.crawldata.CrawledDocument;
 import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.model.crawldata.SerializableCrawlData;
 import nu.marginalia.slop.SlopCrawlDataRecord;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
 import org.jetbrains.annotations.NotNull;
 import org.junit.jupiter.api.*;
 import org.netpreserve.jwarc.*;
@@ -180,7 +179,7 @@ class CrawlerRetreiverTest {
                 new EdgeDomain("www.marginalia.nu"),
                 List.of(), 100);
         var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
-                new WarcRecorder(tempFileWarc2, new BasicCookieStore())
+                new WarcRecorder(tempFileWarc2)
         );
 
         // truncate the size of the file to simulate a crash
@@ -456,7 +455,7 @@ class CrawlerRetreiverTest {
                 List.of(), 100);
 
         var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
-                new WarcRecorder(tempFileWarc3, new BasicCookieStore())
+                new WarcRecorder(tempFileWarc3)
         );
 
         // truncate the size of the file to simulate a crash
@@ -507,7 +506,7 @@ class CrawlerRetreiverTest {
     }
 
     private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, CrawlDataReference reference) {
-        try (var recorder = new WarcRecorder(tempFileWarc2, new BasicCookieStore());
+        try (var recorder = new WarcRecorder(tempFileWarc2);
              var db = new DomainStateDb(tempFileDb)
         ) {
            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(), reference);
@@ -519,7 +518,7 @@ class CrawlerRetreiverTest {
 
     @NotNull
     private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlerMain.CrawlSpecRecord specs) {
-        try (var recorder = new WarcRecorder(tempFileWarc1, new BasicCookieStore());
+        try (var recorder = new WarcRecorder(tempFileWarc1);
             var db = new DomainStateDb(tempFileDb)
        ) {
            var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder);
@@ -10,6 +10,7 @@ import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.converting.writer.ConverterBatchWriter;
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.functions.searchquery.QueryFactory;
@@ -43,7 +44,6 @@ import nu.marginalia.process.control.FakeProcessHeartbeat;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.test.IntegrationTestModule;
 import nu.marginalia.test.TestUtil;
-import org.apache.hc.client5.http.cookie.BasicCookieStore;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -121,11 +121,12 @@ public class IntegrationTest {
     public void run() throws Exception {
 
         /** CREATE WARC */
-        try (WarcRecorder warcRecorder = new WarcRecorder(warcData, new BasicCookieStore())) {
+        try (WarcRecorder warcRecorder = new WarcRecorder(warcData)) {
             warcRecorder.writeWarcinfoHeader("127.0.0.1", new EdgeDomain("www.example.com"),
                     new HttpFetcherImpl.DomainProbeResult.Ok(new EdgeUrl("https://www.example.com/")));
 
             warcRecorder.writeReferenceCopy(new EdgeUrl("https://www.example.com/"),
+                    new DomainCookies(),
                     "text/html", 200,
                     """
                     <html>
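
The integration test ties the thread together end to end: the recorder is built from just the output path, and writeReferenceCopy() now takes a DomainCookies argument in second position. A minimal sketch of the setup, assuming the signatures visible above (the trailing writeReferenceCopy arguments are cut off in this hunk and are deliberately not reproduced):

import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;

import java.nio.file.Path;

class WarcSetupSketch {
    void createWarc(Path warcData) throws Exception {
        try (WarcRecorder warcRecorder = new WarcRecorder(warcData)) { // no cookie store parameter any more
            warcRecorder.writeWarcinfoHeader("127.0.0.1", new EdgeDomain("www.example.com"),
                    new HttpFetcherImpl.DomainProbeResult.Ok(new EdgeUrl("https://www.example.com/")));

            // warcRecorder.writeReferenceCopy(url, new DomainCookies(), contentType, status, body, ...)
            // is called next in the test; the remaining parameters are truncated in the hunk above,
            // so they are left as a comment here rather than guessed at.
        }
    }
}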