mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
47 Commits
deploy-010
...
deploy-013
Author | SHA1 | Date | |
---|---|---|---|
|
de4e2849ce | ||
|
3c43f1954e | ||
|
fa2462ec39 | ||
|
f4ad7145db | ||
|
068b450180 | ||
|
05b909a21f | ||
|
3d179cddce | ||
|
1a2aae496a | ||
|
353cdffb3f | ||
|
2e3f1313c7 | ||
|
58e6f141ce | ||
|
500f63e921 | ||
|
6dfbedda1e | ||
|
9715ddb105 | ||
|
1fc6313a77 | ||
|
b1249d5b8a | ||
|
ef95d59b07 | ||
|
acdd8664f5 | ||
|
6b12eac58a | ||
|
bb3f1f395a | ||
|
b661beef41 | ||
|
9888c47f19 | ||
|
dcef7e955b | ||
|
b3973a1dd7 | ||
|
8bd05d6d90 | ||
|
59df8e356e | ||
|
7161162a35 | ||
|
d7c4c5141f | ||
|
88e9b8fb05 | ||
|
b6265cee11 | ||
|
c91af247e9 | ||
|
7a31227de1 | ||
|
4f477604c5 | ||
|
2970f4395b | ||
|
d1ec909b36 | ||
|
c67c5bbf42 | ||
|
ecb0e57a1a | ||
|
8c61f61b46 | ||
|
662a18c933 | ||
|
1c2426a052 | ||
|
34df7441ac | ||
|
5387e2bd80 | ||
|
0f3b24d0f8 | ||
|
a732095d2a | ||
|
6607f0112f | ||
|
4913730de9 | ||
|
1db64f9d56 |
@@ -43,12 +43,11 @@ subprojects.forEach {it ->
|
||||
}
|
||||
|
||||
ext {
|
||||
jvmVersion=23
|
||||
dockerImageBase='container-registry.oracle.com/graalvm/jdk:23'
|
||||
jvmVersion = 24
|
||||
dockerImageBase='container-registry.oracle.com/graalvm/jdk:24'
|
||||
dockerImageTag='latest'
|
||||
dockerImageRegistry='marginalia'
|
||||
jibVersion = '3.4.4'
|
||||
|
||||
}
|
||||
|
||||
idea {
|
||||
|
@@ -14,7 +14,7 @@ public class EdgeDomain implements Serializable {
|
||||
@Nonnull
|
||||
public final String topDomain;
|
||||
|
||||
public EdgeDomain(String host) {
|
||||
public EdgeDomain(@Nonnull String host) {
|
||||
Objects.requireNonNull(host, "domain name must not be null");
|
||||
|
||||
host = host.toLowerCase();
|
||||
@@ -61,6 +61,10 @@ public class EdgeDomain implements Serializable {
|
||||
this.topDomain = topDomain;
|
||||
}
|
||||
|
||||
public static String getTopDomain(String host) {
|
||||
return new EdgeDomain(host).topDomain;
|
||||
}
|
||||
|
||||
private boolean looksLikeGovTld(String host) {
|
||||
if (host.length() < 8)
|
||||
return false;
|
||||
@@ -116,24 +120,6 @@ public class EdgeDomain implements Serializable {
|
||||
return topDomain.substring(0, cutPoint).toLowerCase();
|
||||
}
|
||||
|
||||
public String getLongDomainKey() {
|
||||
StringBuilder ret = new StringBuilder();
|
||||
|
||||
int cutPoint = topDomain.indexOf('.');
|
||||
if (cutPoint < 0) {
|
||||
ret.append(topDomain);
|
||||
} else {
|
||||
ret.append(topDomain, 0, cutPoint);
|
||||
}
|
||||
|
||||
if (!subDomain.isEmpty() && !"www".equals(subDomain)) {
|
||||
ret.append(":");
|
||||
ret.append(subDomain);
|
||||
}
|
||||
|
||||
return ret.toString().toLowerCase();
|
||||
}
|
||||
|
||||
/** If possible, try to provide an alias domain,
|
||||
* i.e. a domain name that is very likely to link to this one
|
||||
* */
|
||||
|
@@ -35,21 +35,8 @@ public class RateLimiter {
|
||||
}
|
||||
|
||||
|
||||
public static RateLimiter forExpensiveRequest() {
|
||||
return new RateLimiter(5, 10);
|
||||
}
|
||||
|
||||
public static RateLimiter custom(int perMinute) {
|
||||
return new RateLimiter(perMinute, 60);
|
||||
}
|
||||
|
||||
public static RateLimiter forSpamBots() {
|
||||
return new RateLimiter(120, 3600);
|
||||
}
|
||||
|
||||
|
||||
public static RateLimiter forLogin() {
|
||||
return new RateLimiter(3, 15);
|
||||
return new RateLimiter(4 * perMinute, perMinute);
|
||||
}
|
||||
|
||||
private void cleanIdleBuckets() {
|
||||
@@ -62,7 +49,7 @@ public class RateLimiter {
|
||||
}
|
||||
|
||||
private Bucket createBucket() {
|
||||
var refill = Refill.greedy(1, Duration.ofSeconds(refillRate));
|
||||
var refill = Refill.greedy(refillRate, Duration.ofSeconds(60));
|
||||
var bw = Bandwidth.classic(capacity, refill);
|
||||
return Bucket.builder().addLimit(bw).build();
|
||||
}
|
||||
|
@@ -5,6 +5,7 @@
|
||||
<Filters>
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
@@ -13,9 +14,20 @@
|
||||
<Filters>
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
<SizeBasedTriggeringPolicy size="10MB" />
|
||||
</RollingFile>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||
</PatternLayout>
|
||||
<SizeBasedTriggeringPolicy size="100MB" />
|
||||
<Filters>
|
||||
<MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</RollingFile>
|
||||
</Appenders>
|
||||
<Loggers>
|
||||
<Logger name="org.apache.zookeeper" level="WARN" />
|
||||
|
@@ -5,6 +5,7 @@
|
||||
<Filters>
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
@@ -17,6 +18,17 @@
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</RollingFile>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||
</PatternLayout>
|
||||
<SizeBasedTriggeringPolicy size="100MB" />
|
||||
<Filters>
|
||||
<MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</RollingFile>
|
||||
</Appenders>
|
||||
|
@@ -25,7 +25,7 @@ import static org.mockito.Mockito.when;
|
||||
class ZkServiceRegistryTest {
|
||||
private static final int ZOOKEEPER_PORT = 2181;
|
||||
private static final GenericContainer<?> zookeeper =
|
||||
new GenericContainer<>("zookeeper:3.8.0")
|
||||
new GenericContainer<>("zookeeper:3.8")
|
||||
.withExposedPorts(ZOOKEEPER_PORT);
|
||||
|
||||
List<ZkServiceRegistry> registries = new ArrayList<>();
|
||||
|
@@ -33,6 +33,7 @@ import java.sql.SQLException;
|
||||
import java.time.*;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
@@ -71,7 +72,7 @@ public class FeedFetcherService {
|
||||
public enum UpdateMode {
|
||||
CLEAN,
|
||||
REFRESH
|
||||
};
|
||||
}
|
||||
|
||||
public void updateFeeds(UpdateMode updateMode) throws IOException {
|
||||
if (updating) // Prevent concurrent updates
|
||||
@@ -87,6 +88,7 @@ public class FeedFetcherService {
|
||||
.followRedirects(HttpClient.Redirect.NORMAL)
|
||||
.version(HttpClient.Version.HTTP_2)
|
||||
.build();
|
||||
ExecutorService fetchExecutor = Executors.newCachedThreadPool();
|
||||
FeedJournal feedJournal = FeedJournal.create();
|
||||
var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Update Rss Feeds")
|
||||
) {
|
||||
@@ -131,7 +133,7 @@ public class FeedFetcherService {
|
||||
|
||||
FetchResult feedData;
|
||||
try (DomainLocks.DomainLock domainLock = domainLocks.lockDomain(new EdgeDomain(feed.domain()))) {
|
||||
feedData = fetchFeedData(feed, client, ifModifiedSinceDate, ifNoneMatchTag);
|
||||
feedData = fetchFeedData(feed, client, fetchExecutor, ifModifiedSinceDate, ifNoneMatchTag);
|
||||
} catch (Exception ex) {
|
||||
feedData = new FetchResult.TransientError();
|
||||
}
|
||||
@@ -211,6 +213,7 @@ public class FeedFetcherService {
|
||||
|
||||
private FetchResult fetchFeedData(FeedDefinition feed,
|
||||
HttpClient client,
|
||||
ExecutorService executorService,
|
||||
@Nullable String ifModifiedSinceDate,
|
||||
@Nullable String ifNoneMatchTag)
|
||||
{
|
||||
@@ -237,7 +240,14 @@ public class FeedFetcherService {
|
||||
HttpRequest getRequest = requestBuilder.build();
|
||||
|
||||
for (int i = 0; i < 3; i++) {
|
||||
HttpResponse<byte[]> rs = client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray());
|
||||
|
||||
/* Note we need to use an executor to time-limit the send() method in HttpClient, as
|
||||
* its support for timeouts only applies to the time until response starts to be received,
|
||||
* and does not catch the case when the server starts to send data but then hangs.
|
||||
*/
|
||||
HttpResponse<byte[]> rs = executorService.submit(
|
||||
() -> client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray()))
|
||||
.get(15, TimeUnit.SECONDS);
|
||||
|
||||
if (rs.statusCode() == 429) { // Too Many Requests
|
||||
int retryAfter = Integer.parseInt(rs.headers().firstValue("Retry-After").orElse("2"));
|
||||
|
@@ -23,16 +23,33 @@ public class SimpleBlockingThreadPool {
|
||||
private final Logger logger = LoggerFactory.getLogger(SimpleBlockingThreadPool.class);
|
||||
|
||||
public SimpleBlockingThreadPool(String name, int poolSize, int queueSize) {
|
||||
this(name, poolSize, queueSize, ThreadType.PLATFORM);
|
||||
}
|
||||
|
||||
public SimpleBlockingThreadPool(String name, int poolSize, int queueSize, ThreadType threadType) {
|
||||
tasks = new ArrayBlockingQueue<>(queueSize);
|
||||
|
||||
for (int i = 0; i < poolSize; i++) {
|
||||
Thread worker = new Thread(this::worker, name + "[" + i + "]");
|
||||
worker.setDaemon(true);
|
||||
worker.start();
|
||||
|
||||
Thread.Builder threadBuilder = switch (threadType) {
|
||||
case VIRTUAL -> Thread.ofVirtual();
|
||||
case PLATFORM -> Thread.ofPlatform().daemon(true);
|
||||
};
|
||||
|
||||
Thread worker = threadBuilder
|
||||
.name(name + "[" + i + "]")
|
||||
.start(this::worker);
|
||||
|
||||
workers.add(worker);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public enum ThreadType {
|
||||
VIRTUAL,
|
||||
PLATFORM
|
||||
}
|
||||
|
||||
public void submit(Task task) throws InterruptedException {
|
||||
tasks.put(task);
|
||||
}
|
||||
|
@@ -87,6 +87,8 @@ dependencies {
|
||||
implementation libs.commons.compress
|
||||
implementation libs.sqlite
|
||||
|
||||
implementation libs.bundles.httpcomponents
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
|
@@ -8,7 +8,6 @@ import nu.marginalia.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.converting.processor.DomainProcessor;
|
||||
import nu.marginalia.crawl.CrawlerMain;
|
||||
import nu.marginalia.crawl.DomainStateDb;
|
||||
import nu.marginalia.crawl.fetcher.Cookies;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
@@ -247,7 +246,7 @@ public class CrawlingThenConvertingIntegrationTest {
|
||||
private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws Exception {
|
||||
List<SerializableCrawlData> data = new ArrayList<>();
|
||||
|
||||
try (var recorder = new WarcRecorder(fileName, new Cookies());
|
||||
try (var recorder = new WarcRecorder(fileName);
|
||||
var db = new DomainStateDb(dbTempFile))
|
||||
{
|
||||
new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, db, recorder).crawlDomain();
|
||||
|
@@ -60,10 +60,14 @@ dependencies {
|
||||
implementation libs.fastutil
|
||||
|
||||
implementation libs.bundles.mariadb
|
||||
implementation libs.bundles.httpcomponents
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
testImplementation libs.wiremock
|
||||
|
||||
|
||||
|
||||
testImplementation project(':code:processes:test-data')
|
||||
}
|
||||
|
@@ -41,10 +41,7 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.security.Security;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
@@ -106,9 +103,18 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
this.blacklist = blacklist;
|
||||
this.node = processConfiguration.node();
|
||||
|
||||
SimpleBlockingThreadPool.ThreadType threadType;
|
||||
if (Boolean.getBoolean("crawler.useVirtualThreads")) {
|
||||
threadType = SimpleBlockingThreadPool.ThreadType.VIRTUAL;
|
||||
}
|
||||
else {
|
||||
threadType = SimpleBlockingThreadPool.ThreadType.PLATFORM;
|
||||
}
|
||||
|
||||
pool = new SimpleBlockingThreadPool("CrawlerPool",
|
||||
Integer.getInteger("crawler.poolSize", 256),
|
||||
1);
|
||||
1,
|
||||
threadType);
|
||||
|
||||
|
||||
// Wait for the blacklist to be loaded before starting the crawl
|
||||
@@ -224,10 +230,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
|
||||
logger.info("Loaded {} domains", crawlSpecRecords.size());
|
||||
|
||||
// Shuffle the domains to ensure we get a good mix of domains in each crawl,
|
||||
// so that e.g. the big domains don't get all crawled at once, or we end up
|
||||
// crawling the same server in parallel from different subdomains...
|
||||
Collections.shuffle(crawlSpecRecords);
|
||||
crawlSpecRecords.sort(crawlSpecArrangement(crawlSpecRecords));
|
||||
|
||||
// First a validation run to ensure the file is all good to parse
|
||||
if (crawlSpecRecords.isEmpty()) {
|
||||
@@ -306,6 +309,30 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
}
|
||||
}
|
||||
|
||||
/** Create a comparator that sorts the crawl specs in a way that is beneficial for the crawl,
|
||||
* we want to enqueue domains that have common top domains first, but otherwise have a random
|
||||
* order.
|
||||
* <p></p>
|
||||
* Note, we can't use hash codes for randomization as it is not desirable to have the same order
|
||||
* every time the process is restarted (and CrawlSpecRecord is a record, which defines equals and
|
||||
* hashcode based on the fields).
|
||||
* */
|
||||
private Comparator<CrawlSpecRecord> crawlSpecArrangement(List<CrawlSpecRecord> records) {
|
||||
Random r = new Random();
|
||||
Map<String, Integer> topDomainCounts = new HashMap<>(4 + (int) Math.sqrt(records.size()));
|
||||
Map<String, Integer> randomOrder = new HashMap<>(records.size());
|
||||
|
||||
for (var spec : records) {
|
||||
topDomainCounts.merge(EdgeDomain.getTopDomain(spec.domain), 1, Integer::sum);
|
||||
randomOrder.put(spec.domain, r.nextInt());
|
||||
}
|
||||
|
||||
return Comparator.comparing((CrawlSpecRecord spec) -> topDomainCounts.getOrDefault(EdgeDomain.getTopDomain(spec.domain), 0) >= 8)
|
||||
.reversed()
|
||||
.thenComparing(spec -> randomOrder.get(spec.domain))
|
||||
.thenComparing(Record::hashCode); // non-deterministic tie-breaker to
|
||||
}
|
||||
|
||||
/** Submit a task for execution if it can be run, returns true if it was submitted
|
||||
* or if it can be discarded */
|
||||
private boolean trySubmitDeferredTask(CrawlTask task) {
|
||||
@@ -411,7 +438,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
Files.deleteIfExists(tempFile);
|
||||
}
|
||||
|
||||
try (var warcRecorder = new WarcRecorder(newWarcFile, fetcher); // write to a temp file for now
|
||||
try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
|
||||
var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
|
||||
CrawlDataReference reference = getReference()
|
||||
)
|
||||
@@ -474,7 +501,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
return new CrawlDataReference(slopPath);
|
||||
}
|
||||
|
||||
} catch (IOException e) {
|
||||
} catch (Exception e) {
|
||||
logger.debug("Failed to read previous crawl data for {}", specification.domain());
|
||||
}
|
||||
|
||||
|
@@ -11,6 +11,7 @@ import java.nio.file.Path;
|
||||
import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.SQLException;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
@@ -24,6 +25,17 @@ public class DomainStateDb implements AutoCloseable {
|
||||
|
||||
private final Connection connection;
|
||||
|
||||
|
||||
public record CrawlMeta(
|
||||
String domainName,
|
||||
Instant lastFullCrawl,
|
||||
Duration recrawlTime,
|
||||
Duration crawlTime,
|
||||
int recrawlErrors,
|
||||
int crawlChanges,
|
||||
int totalCrawlSize
|
||||
) {}
|
||||
|
||||
public record SummaryRecord(
|
||||
String domainName,
|
||||
Instant lastUpdated,
|
||||
@@ -102,6 +114,17 @@ public class DomainStateDb implements AutoCloseable {
|
||||
feedUrl TEXT
|
||||
)
|
||||
""");
|
||||
stmt.executeUpdate("""
|
||||
CREATE TABLE IF NOT EXISTS crawl_meta (
|
||||
domain TEXT PRIMARY KEY,
|
||||
lastFullCrawlEpochMs LONG NOT NULL,
|
||||
recrawlTimeMs LONG NOT NULL,
|
||||
recrawlErrors INTEGER NOT NULL,
|
||||
crawlTimeMs LONG NOT NULL,
|
||||
crawlChanges INTEGER NOT NULL,
|
||||
totalCrawlSize INTEGER NOT NULL
|
||||
)
|
||||
""");
|
||||
stmt.executeUpdate("""
|
||||
CREATE TABLE IF NOT EXISTS favicon (
|
||||
domain TEXT PRIMARY KEY,
|
||||
@@ -164,6 +187,26 @@ public class DomainStateDb implements AutoCloseable {
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
public void save(CrawlMeta crawlMeta) {
|
||||
if (connection == null) throw new IllegalStateException("No connection to domainstate db");
|
||||
|
||||
try (var stmt = connection.prepareStatement("""
|
||||
INSERT OR REPLACE INTO crawl_meta (domain, lastFullCrawlEpochMs, recrawlTimeMs, recrawlErrors, crawlTimeMs, crawlChanges, totalCrawlSize)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||
""")) {
|
||||
stmt.setString(1, crawlMeta.domainName());
|
||||
stmt.setLong(2, crawlMeta.lastFullCrawl.toEpochMilli());
|
||||
stmt.setLong(3, crawlMeta.recrawlTime.toMillis());
|
||||
stmt.setInt(4, crawlMeta.recrawlErrors);
|
||||
stmt.setLong(5, crawlMeta.crawlTime.toMillis());
|
||||
stmt.setInt(6, crawlMeta.crawlChanges);
|
||||
stmt.setInt(7, crawlMeta.totalCrawlSize);
|
||||
stmt.executeUpdate();
|
||||
} catch (SQLException e) {
|
||||
logger.error("Failed to insert crawl meta record", e);
|
||||
}
|
||||
}
|
||||
|
||||
public void save(SummaryRecord record) {
|
||||
if (connection == null) throw new IllegalStateException("No connection to domainstate db");
|
||||
|
||||
@@ -182,7 +225,35 @@ public class DomainStateDb implements AutoCloseable {
|
||||
}
|
||||
}
|
||||
|
||||
public Optional<SummaryRecord> get(String domainName) {
|
||||
public Optional<CrawlMeta> getMeta(String domainName) {
|
||||
if (connection == null)
|
||||
return Optional.empty();
|
||||
|
||||
try (var stmt = connection.prepareStatement("""
|
||||
SELECT domain, lastFullCrawlEpochMs, recrawlTimeMs, recrawlErrors, crawlTimeMs, crawlChanges, totalCrawlSize
|
||||
FROM crawl_meta
|
||||
WHERE domain = ?
|
||||
""")) {
|
||||
stmt.setString(1, domainName);
|
||||
var rs = stmt.executeQuery();
|
||||
if (rs.next()) {
|
||||
return Optional.of(new CrawlMeta(
|
||||
rs.getString("domain"),
|
||||
Instant.ofEpochMilli(rs.getLong("lastFullCrawlEpochMs")),
|
||||
Duration.ofMillis(rs.getLong("recrawlTimeMs")),
|
||||
Duration.ofMillis(rs.getLong("crawlTimeMs")),
|
||||
rs.getInt("recrawlErrors"),
|
||||
rs.getInt("crawlChanges"),
|
||||
rs.getInt("totalCrawlSize")
|
||||
));
|
||||
}
|
||||
} catch (SQLException ex) {
|
||||
logger.error("Failed to get crawl meta record", ex);
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
public Optional<SummaryRecord> getSummary(String domainName) {
|
||||
if (connection == null)
|
||||
return Optional.empty();
|
||||
|
||||
|
@@ -1,6 +1,6 @@
|
||||
package nu.marginalia.crawl.fetcher;
|
||||
|
||||
import java.net.http.HttpRequest;
|
||||
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||
|
||||
/** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
|
||||
public record ContentTags(String etag, String lastMod) {
|
||||
@@ -17,14 +17,14 @@ public record ContentTags(String etag, String lastMod) {
|
||||
}
|
||||
|
||||
/** Paints the tags onto the request builder. */
|
||||
public void paint(HttpRequest.Builder getBuilder) {
|
||||
public void paint(HttpGet request) {
|
||||
|
||||
if (etag != null) {
|
||||
getBuilder.header("If-None-Match", etag);
|
||||
request.addHeader("If-None-Match", etag);
|
||||
}
|
||||
|
||||
if (lastMod != null) {
|
||||
getBuilder.header("If-Modified-Since", lastMod);
|
||||
request.addHeader("If-Modified-Since", lastMod);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -1,34 +0,0 @@
|
||||
package nu.marginalia.crawl.fetcher;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.CookieHandler;
|
||||
import java.net.URI;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
public class Cookies extends CookieHandler {
|
||||
final ThreadLocal<ConcurrentHashMap<String, List<String>>> cookieJar = ThreadLocal.withInitial(ConcurrentHashMap::new);
|
||||
|
||||
public void clear() {
|
||||
cookieJar.get().clear();
|
||||
}
|
||||
|
||||
public boolean hasCookies() {
|
||||
return !cookieJar.get().isEmpty();
|
||||
}
|
||||
|
||||
public List<String> getCookies() {
|
||||
return cookieJar.get().values().stream().flatMap(List::stream).toList();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, List<String>> get(URI uri, Map<String, List<String>> requestHeaders) throws IOException {
|
||||
return cookieJar.get();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void put(URI uri, Map<String, List<String>> responseHeaders) throws IOException {
|
||||
cookieJar.get().putAll(responseHeaders);
|
||||
}
|
||||
}
|
@@ -0,0 +1,56 @@
|
||||
package nu.marginalia.crawl.fetcher;
|
||||
|
||||
import org.apache.hc.client5.http.classic.methods.HttpUriRequestBase;
|
||||
import org.apache.hc.core5.http.ClassicHttpRequest;
|
||||
import org.apache.hc.core5.http.HttpResponse;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
public class DomainCookies {
|
||||
private final Map<String, String> cookies = new HashMap<>();
|
||||
|
||||
public boolean hasCookies() {
|
||||
return !cookies.isEmpty();
|
||||
}
|
||||
|
||||
public void updateCookieStore(HttpResponse response) {
|
||||
for (var header : response.getHeaders()) {
|
||||
if (header.getName().equalsIgnoreCase("Set-Cookie")) {
|
||||
parseCookieHeader(header.getValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void parseCookieHeader(String value) {
|
||||
// Parse the Set-Cookie header value and extract the cookies
|
||||
|
||||
String[] parts = value.split(";");
|
||||
String cookie = parts[0].trim();
|
||||
|
||||
if (cookie.contains("=")) {
|
||||
String[] cookieParts = cookie.split("=");
|
||||
String name = cookieParts[0].trim();
|
||||
String val = cookieParts[1].trim();
|
||||
cookies.put(name, val);
|
||||
}
|
||||
}
|
||||
|
||||
public void paintRequest(HttpUriRequestBase request) {
|
||||
request.addHeader("Cookie", createCookieHeader());
|
||||
}
|
||||
|
||||
public void paintRequest(ClassicHttpRequest request) {
|
||||
request.addHeader("Cookie", createCookieHeader());
|
||||
}
|
||||
|
||||
private String createCookieHeader() {
|
||||
StringJoiner sj = new StringJoiner("; ");
|
||||
for (var cookie : cookies.entrySet()) {
|
||||
sj.add(cookie.getKey() + "=" + cookie.getValue());
|
||||
}
|
||||
return sj.toString();
|
||||
}
|
||||
|
||||
}
|
@@ -8,6 +8,7 @@ import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.body.HttpFetchResult;
|
||||
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
|
||||
import org.apache.hc.client5.http.cookie.CookieStore;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@@ -15,20 +16,17 @@ import java.util.List;
|
||||
public interface HttpFetcher extends AutoCloseable {
|
||||
void setAllowAllContentTypes(boolean allowAllContentTypes);
|
||||
|
||||
Cookies getCookies();
|
||||
CookieStore getCookies();
|
||||
void clearCookies();
|
||||
|
||||
DomainProbeResult probeDomain(EdgeUrl url);
|
||||
|
||||
ContentTypeProbeResult probeContentType(
|
||||
EdgeUrl url,
|
||||
WarcRecorder recorder,
|
||||
ContentTags tags) throws HttpFetcherImpl.RateLimitException;
|
||||
|
||||
HttpFetchResult fetchContent(EdgeUrl url,
|
||||
WarcRecorder recorder,
|
||||
DomainCookies cookies,
|
||||
CrawlDelayTimer timer,
|
||||
ContentTags tags,
|
||||
ProbeType probeType) throws Exception;
|
||||
ProbeType probeType);
|
||||
|
||||
List<EdgeUrl> fetchSitemapUrls(String rootSitemapUrl, CrawlDelayTimer delayTimer);
|
||||
|
||||
@@ -46,6 +44,7 @@ public interface HttpFetcher extends AutoCloseable {
|
||||
|
||||
/** This domain redirects to another domain */
|
||||
record Redirect(EdgeDomain domain) implements DomainProbeResult {}
|
||||
record RedirectSameDomain_Internal(EdgeUrl domain) implements DomainProbeResult {}
|
||||
|
||||
/** If the retrieval of the probed url was successful, return the url as it was fetched
|
||||
* (which may be different from the url we probed, if we attempted another URL schema).
|
||||
@@ -56,7 +55,10 @@ public interface HttpFetcher extends AutoCloseable {
|
||||
}
|
||||
|
||||
sealed interface ContentTypeProbeResult {
|
||||
record NoOp() implements ContentTypeProbeResult {}
|
||||
record Ok(EdgeUrl resolvedUrl) implements ContentTypeProbeResult { }
|
||||
record HttpError(int statusCode, String message) implements ContentTypeProbeResult { }
|
||||
record Redirect(EdgeUrl location) implements ContentTypeProbeResult { }
|
||||
record BadContentType(String contentType, int statusCode) implements ContentTypeProbeResult { }
|
||||
record Timeout(java.lang.Exception ex) implements ContentTypeProbeResult { }
|
||||
record Exception(java.lang.Exception ex) implements ContentTypeProbeResult { }
|
||||
|
@@ -5,68 +5,167 @@ import com.google.inject.Singleton;
|
||||
import crawlercommons.robots.SimpleRobotRules;
|
||||
import crawlercommons.robots.SimpleRobotRulesParser;
|
||||
import nu.marginalia.UserAgent;
|
||||
import nu.marginalia.crawl.fetcher.socket.NoSecuritySSL;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
||||
import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.body.ContentTypeLogic;
|
||||
import nu.marginalia.model.body.DocumentBodyExtractor;
|
||||
import nu.marginalia.model.body.HttpFetchResult;
|
||||
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
|
||||
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
|
||||
import org.apache.hc.client5.http.HttpRequestRetryStrategy;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||
import org.apache.hc.client5.http.config.ConnectionConfig;
|
||||
import org.apache.hc.client5.http.config.RequestConfig;
|
||||
import org.apache.hc.client5.http.cookie.BasicCookieStore;
|
||||
import org.apache.hc.client5.http.cookie.CookieStore;
|
||||
import org.apache.hc.client5.http.cookie.StandardCookieSpec;
|
||||
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
|
||||
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager;
|
||||
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder;
|
||||
import org.apache.hc.client5.http.ssl.DefaultClientTlsStrategy;
|
||||
import org.apache.hc.core5.http.*;
|
||||
import org.apache.hc.core5.http.io.HttpClientResponseHandler;
|
||||
import org.apache.hc.core5.http.io.SocketConfig;
|
||||
import org.apache.hc.core5.http.io.entity.EntityUtils;
|
||||
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
||||
import org.apache.hc.core5.http.message.MessageSupport;
|
||||
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||
import org.apache.hc.core5.pool.PoolStats;
|
||||
import org.apache.hc.core5.util.TimeValue;
|
||||
import org.apache.hc.core5.util.Timeout;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.parser.Parser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.slf4j.Marker;
|
||||
import org.slf4j.MarkerFactory;
|
||||
|
||||
import javax.net.ssl.SSLContext;
|
||||
import javax.net.ssl.SSLException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.net.http.HttpTimeoutException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.time.Duration;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.Semaphore;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
|
||||
@Singleton
|
||||
public class HttpFetcherImpl implements HttpFetcher {
|
||||
public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final String userAgentString;
|
||||
private final String userAgentIdentifier;
|
||||
private final Cookies cookies = new Cookies();
|
||||
|
||||
private final CookieStore cookies = new BasicCookieStore();
|
||||
|
||||
private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
|
||||
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
|
||||
private final Marker crawlerAuditMarker = MarkerFactory.getMarker("CRAWLER");
|
||||
|
||||
private final Duration requestTimeout = Duration.ofSeconds(10);
|
||||
private final Duration probeTimeout = Duration.ofSeconds(30);
|
||||
|
||||
private final LinkParser linkParser = new LinkParser();
|
||||
@Override
|
||||
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
|
||||
contentTypeLogic.setAllowAllContentTypes(allowAllContentTypes);
|
||||
}
|
||||
|
||||
private final HttpClient client;
|
||||
private final CloseableHttpClient client;
|
||||
private PoolingHttpClientConnectionManager connectionManager;
|
||||
|
||||
private HttpClient createClient() {
|
||||
return HttpClient.newBuilder()
|
||||
.sslContext(NoSecuritySSL.buildSslContext())
|
||||
.cookieHandler(cookies)
|
||||
.followRedirects(HttpClient.Redirect.NORMAL)
|
||||
.connectTimeout(Duration.ofSeconds(8))
|
||||
.executor(Executors.newCachedThreadPool())
|
||||
public PoolStats getPoolStats() {
|
||||
return connectionManager.getTotalStats();
|
||||
}
|
||||
|
||||
private CloseableHttpClient createClient() throws NoSuchAlgorithmException {
|
||||
final ConnectionConfig connectionConfig = ConnectionConfig.custom()
|
||||
.setSocketTimeout(10, TimeUnit.SECONDS)
|
||||
.setConnectTimeout(30, TimeUnit.SECONDS)
|
||||
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
|
||||
.build();
|
||||
|
||||
connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
|
||||
.setMaxConnPerRoute(2)
|
||||
.setMaxConnTotal(5000)
|
||||
.setDefaultConnectionConfig(connectionConfig)
|
||||
.setTlsSocketStrategy(new DefaultClientTlsStrategy(SSLContext.getDefault()))
|
||||
.build();
|
||||
|
||||
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
|
||||
.setSoLinger(TimeValue.ofSeconds(-1))
|
||||
.setSoTimeout(Timeout.ofSeconds(10))
|
||||
.build()
|
||||
);
|
||||
|
||||
Thread.ofPlatform().daemon(true).start(() -> {
|
||||
try {
|
||||
for (;;) {
|
||||
TimeUnit.SECONDS.sleep(15);
|
||||
logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
|
||||
}
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
});
|
||||
|
||||
final RequestConfig defaultRequestConfig = RequestConfig.custom()
|
||||
.setCookieSpec(StandardCookieSpec.RELAXED)
|
||||
.setResponseTimeout(10, TimeUnit.SECONDS)
|
||||
.setConnectionRequestTimeout(5, TimeUnit.MINUTES)
|
||||
.build();
|
||||
|
||||
return HttpClients.custom()
|
||||
.setDefaultCookieStore(cookies)
|
||||
.setConnectionManager(connectionManager)
|
||||
.setRetryStrategy(this)
|
||||
.setKeepAliveStrategy(new ConnectionKeepAliveStrategy() {
|
||||
// Default keep-alive duration is 3 minutes, but this is too long for us,
|
||||
// as we are either going to re-use it fairly quickly or close it for a long time.
|
||||
//
|
||||
// So we set it to 30 seconds or clamp the server-provided value to a minimum of 10 seconds.
|
||||
private static final TimeValue defaultValue = TimeValue.ofSeconds(30);
|
||||
|
||||
@Override
|
||||
public TimeValue getKeepAliveDuration(HttpResponse response, HttpContext context) {
|
||||
final Iterator<HeaderElement> it = MessageSupport.iterate(response, HeaderElements.KEEP_ALIVE);
|
||||
|
||||
while (it.hasNext()) {
|
||||
final HeaderElement he = it.next();
|
||||
final String param = he.getName();
|
||||
final String value = he.getValue();
|
||||
|
||||
if (value == null)
|
||||
continue;
|
||||
if (!"timeout".equalsIgnoreCase(param))
|
||||
continue;
|
||||
|
||||
try {
|
||||
long timeout = Long.parseLong(value);
|
||||
timeout = Math.clamp(timeout, 30, defaultValue.toSeconds());
|
||||
return TimeValue.ofSeconds(timeout);
|
||||
} catch (final NumberFormatException ignore) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return defaultValue;
|
||||
}
|
||||
})
|
||||
.disableRedirectHandling()
|
||||
.setDefaultRequestConfig(defaultRequestConfig)
|
||||
.build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Cookies getCookies() {
|
||||
public CookieStore getCookies() {
|
||||
return cookies;
|
||||
}
|
||||
|
||||
@@ -78,19 +177,27 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
@Inject
|
||||
public HttpFetcherImpl(UserAgent userAgent)
|
||||
{
|
||||
this.client = createClient();
|
||||
try {
|
||||
this.client = createClient();
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
this.userAgentString = userAgent.uaString();
|
||||
this.userAgentIdentifier = userAgent.uaIdentifier();
|
||||
}
|
||||
|
||||
public HttpFetcherImpl(String userAgent) {
|
||||
this.client = createClient();
|
||||
try {
|
||||
this.client = createClient();
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
this.userAgentString = userAgent;
|
||||
this.userAgentIdentifier = userAgent;
|
||||
}
|
||||
|
||||
// Not necessary in prod, but useful in test
|
||||
public void close() {
|
||||
public void close() throws IOException {
|
||||
client.close();
|
||||
}
|
||||
|
||||
@@ -103,34 +210,94 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
*/
|
||||
@Override
|
||||
public DomainProbeResult probeDomain(EdgeUrl url) {
|
||||
HttpRequest head;
|
||||
try {
|
||||
head = HttpRequest.newBuilder()
|
||||
.HEAD()
|
||||
.uri(url.asURI())
|
||||
.header("User-agent", userAgentString)
|
||||
.timeout(probeTimeout)
|
||||
.build();
|
||||
} catch (URISyntaxException e) {
|
||||
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
|
||||
}
|
||||
List<EdgeUrl> urls = new ArrayList<>();
|
||||
urls.add(url);
|
||||
|
||||
for (int tries = 0;; tries++) {
|
||||
int redirects = 0;
|
||||
AtomicBoolean tryGet = new AtomicBoolean(false);
|
||||
|
||||
while (!urls.isEmpty() && ++redirects < 5) {
|
||||
ClassicHttpRequest request;
|
||||
|
||||
EdgeUrl topUrl = urls.removeFirst();
|
||||
try {
|
||||
var rsp = SendLock.wrapSend(client, head, HttpResponse.BodyHandlers.discarding());
|
||||
EdgeUrl rspUri = new EdgeUrl(rsp.uri());
|
||||
|
||||
if (!Objects.equals(rspUri.domain, url.domain)) {
|
||||
return new DomainProbeResult.Redirect(rspUri.domain);
|
||||
if (tryGet.get()) {
|
||||
request = ClassicRequestBuilder.get(topUrl.asURI())
|
||||
.addHeader("User-Agent", userAgentString)
|
||||
.addHeader("Accept-Encoding", "gzip")
|
||||
.addHeader("Range", "bytes=0-255")
|
||||
.build();
|
||||
} else {
|
||||
request = ClassicRequestBuilder.head(topUrl.asURI())
|
||||
.addHeader("User-Agent", userAgentString)
|
||||
.addHeader("Accept-Encoding", "gzip")
|
||||
.build();
|
||||
}
|
||||
return new DomainProbeResult.Ok(rspUri);
|
||||
} catch (Exception ex) {
|
||||
if (tries > 3) {
|
||||
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
|
||||
}
|
||||
// else try again ...
|
||||
} catch (URISyntaxException e) {
|
||||
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
|
||||
}
|
||||
|
||||
try {
|
||||
var result = SendLock.wrapSend(client, request, response -> {
|
||||
EntityUtils.consume(response.getEntity());
|
||||
|
||||
return switch (response.getCode()) {
|
||||
case 200 -> new DomainProbeResult.Ok(url);
|
||||
case 405 -> {
|
||||
if (!tryGet.get()) {
|
||||
tryGet.set(true);
|
||||
yield new DomainProbeResult.RedirectSameDomain_Internal(url);
|
||||
}
|
||||
else {
|
||||
yield new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "HTTP status 405, tried HEAD and GET?!");
|
||||
}
|
||||
}
|
||||
case 301, 302, 307 -> {
|
||||
var location = response.getFirstHeader("Location");
|
||||
|
||||
if (location != null) {
|
||||
Optional<EdgeUrl> newUrl = linkParser.parseLink(topUrl, location.getValue());
|
||||
if (newUrl.isEmpty()) {
|
||||
yield new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid location header on redirect");
|
||||
}
|
||||
EdgeUrl newEdgeUrl = newUrl.get();
|
||||
if (newEdgeUrl.domain.equals(topUrl.domain)) {
|
||||
yield new DomainProbeResult.RedirectSameDomain_Internal(newEdgeUrl);
|
||||
}
|
||||
else {
|
||||
yield new DomainProbeResult.Redirect(newEdgeUrl.domain);
|
||||
}
|
||||
}
|
||||
|
||||
yield new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "No location header on redirect");
|
||||
|
||||
}
|
||||
default ->
|
||||
new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "HTTP status " + response.getCode());
|
||||
};
|
||||
});
|
||||
|
||||
if (result instanceof DomainProbeResult.RedirectSameDomain_Internal(EdgeUrl redirUrl)) {
|
||||
urls.add(redirUrl);
|
||||
}
|
||||
else {
|
||||
return result;
|
||||
}
|
||||
|
||||
// We don't have robots.txt yet, so we'll assume a request delay of 1 second
|
||||
TimeUnit.SECONDS.sleep(1);
|
||||
}
|
||||
catch (SocketTimeoutException ex) {
|
||||
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Timeout during domain probe");
|
||||
}
|
||||
catch (Exception ex) {
|
||||
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Error during domain probe");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Failed to resolve domain root");
|
||||
|
||||
}
|
||||
|
||||
/** Perform a HEAD request to fetch the content type of a URL.
|
||||
@@ -141,70 +308,73 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
* recorded in the WARC file on failure.
|
||||
*/
|
||||
public ContentTypeProbeResult probeContentType(EdgeUrl url,
|
||||
WarcRecorder warcRecorder,
|
||||
ContentTags tags) throws RateLimitException {
|
||||
if (tags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) {
|
||||
|
||||
try {
|
||||
var headBuilder = HttpRequest.newBuilder()
|
||||
.HEAD()
|
||||
.uri(url.asURI())
|
||||
.header("User-Agent", userAgentString)
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.timeout(requestTimeout)
|
||||
;
|
||||
|
||||
var rsp = SendLock.wrapSend(client, headBuilder.build(), HttpResponse.BodyHandlers.discarding());
|
||||
var headers = rsp.headers();
|
||||
|
||||
var contentTypeHeader = headers.firstValue("Content-Type").orElse(null);
|
||||
|
||||
if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
|
||||
warcRecorder.flagAsFailedContentTypeProbe(url, contentTypeHeader, rsp.statusCode());
|
||||
|
||||
return new ContentTypeProbeResult.BadContentType(contentTypeHeader, rsp.statusCode());
|
||||
}
|
||||
|
||||
// Update the URL to the final URL of the HEAD request, otherwise we might end up doing
|
||||
|
||||
// HEAD 301 url1 -> url2
|
||||
// HEAD 200 url2
|
||||
// GET 301 url1 -> url2
|
||||
// GET 200 url2
|
||||
|
||||
// which is not what we want. Overall we want to do as few requests as possible to not raise
|
||||
// too many eyebrows when looking at the logs on the target server. Overall it's probably desirable
|
||||
// that it looks like the traffic makes sense, as opposed to looking like a broken bot.
|
||||
|
||||
var redirectUrl = new EdgeUrl(rsp.uri());
|
||||
EdgeUrl ret;
|
||||
|
||||
if (Objects.equals(redirectUrl.domain, url.domain)) ret = redirectUrl;
|
||||
else ret = url;
|
||||
|
||||
// Intercept rate limiting
|
||||
if (rsp.statusCode() == 429) {
|
||||
throw new HttpFetcherImpl.RateLimitException(headers.firstValue("Retry-After").orElse("1"));
|
||||
}
|
||||
|
||||
return new ContentTypeProbeResult.Ok(ret);
|
||||
}
|
||||
catch (HttpTimeoutException ex) {
|
||||
warcRecorder.flagAsTimeout(url);
|
||||
return new ContentTypeProbeResult.Timeout(ex);
|
||||
}
|
||||
catch (RateLimitException ex) {
|
||||
throw ex;
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
|
||||
|
||||
warcRecorder.flagAsError(url, ex);
|
||||
|
||||
return new ContentTypeProbeResult.Exception(ex);
|
||||
}
|
||||
DomainCookies cookies,
|
||||
CrawlDelayTimer timer,
|
||||
ContentTags tags) {
|
||||
if (!tags.isEmpty() || !contentTypeLogic.isUrlLikeBinary(url)) {
|
||||
return new ContentTypeProbeResult.NoOp();
|
||||
}
|
||||
|
||||
try {
|
||||
ClassicHttpRequest head = ClassicRequestBuilder.head(url.asURI())
|
||||
.addHeader("User-Agent", userAgentString)
|
||||
.addHeader("Accept-Encoding", "gzip")
|
||||
.build();
|
||||
|
||||
cookies.paintRequest(head);
|
||||
|
||||
return SendLock.wrapSend(client, head, (rsp) -> {
|
||||
cookies.updateCookieStore(rsp);
|
||||
EntityUtils.consume(rsp.getEntity());
|
||||
int statusCode = rsp.getCode();
|
||||
|
||||
// Handle redirects
|
||||
if (statusCode == 301 || statusCode == 302 || statusCode == 307) {
|
||||
var location = rsp.getFirstHeader("Location");
|
||||
if (location != null) {
|
||||
Optional<EdgeUrl> newUrl = linkParser.parseLink(url, location.getValue());
|
||||
if (newUrl.isEmpty())
|
||||
return new ContentTypeProbeResult.HttpError(statusCode, "Invalid location header on redirect");
|
||||
return new ContentTypeProbeResult.Redirect(newUrl.get());
|
||||
}
|
||||
}
|
||||
|
||||
if (statusCode == 405) {
|
||||
// If we get a 405, we can't probe the content type with HEAD, so we'll just say it's ok
|
||||
return new ContentTypeProbeResult.Ok(url);
|
||||
}
|
||||
|
||||
// Handle errors
|
||||
if (statusCode < 200 || statusCode > 300) {
|
||||
return new ContentTypeProbeResult.HttpError(statusCode, "Bad status code");
|
||||
}
|
||||
|
||||
// Handle missing content type
|
||||
var ctHeader = rsp.getFirstHeader("Content-Type");
|
||||
if (ctHeader == null) {
|
||||
return new ContentTypeProbeResult.HttpError(statusCode, "Missing Content-Type header");
|
||||
}
|
||||
var contentType = ctHeader.getValue();
|
||||
|
||||
// Check if the content type is allowed
|
||||
if (contentTypeLogic.isAllowableContentType(contentType)) {
|
||||
return new ContentTypeProbeResult.Ok(url);
|
||||
} else {
|
||||
return new ContentTypeProbeResult.BadContentType(contentType, statusCode);
|
||||
}
|
||||
});
|
||||
}
|
||||
catch (SocketTimeoutException ex) {
|
||||
|
||||
return new ContentTypeProbeResult.Timeout(ex);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
|
||||
return new ContentTypeProbeResult.Exception(ex);
|
||||
}
|
||||
finally {
|
||||
timer.waitFetchDelay();
|
||||
}
|
||||
return new ContentTypeProbeResult.Ok(url);
|
||||
}
|
||||
|
||||
/** Fetch the content of a URL, and record it in a WARC file,
|
||||
@@ -214,38 +384,74 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
@Override
|
||||
public HttpFetchResult fetchContent(EdgeUrl url,
|
||||
WarcRecorder warcRecorder,
|
||||
DomainCookies cookies,
|
||||
CrawlDelayTimer timer,
|
||||
ContentTags contentTags,
|
||||
ProbeType probeType)
|
||||
throws Exception
|
||||
{
|
||||
var getBuilder = HttpRequest.newBuilder()
|
||||
.GET()
|
||||
.uri(url.asURI())
|
||||
.header("User-Agent", userAgentString)
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.header("Accept-Language", "en,*;q=0.5")
|
||||
.header("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
|
||||
.timeout(requestTimeout)
|
||||
;
|
||||
|
||||
contentTags.paint(getBuilder);
|
||||
|
||||
try (var sl = new SendLock()) {
|
||||
HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());
|
||||
|
||||
if (result instanceof HttpFetchResult.ResultOk ok) {
|
||||
if (ok.statusCode() == 429) {
|
||||
throw new RateLimitException(Objects.requireNonNullElse(ok.header("Retry-After"), "1"));
|
||||
}
|
||||
if (ok.statusCode() == 304) {
|
||||
return new HttpFetchResult.Result304Raw();
|
||||
}
|
||||
if (ok.statusCode() == 200) {
|
||||
return ok;
|
||||
try {
|
||||
if (probeType == HttpFetcher.ProbeType.FULL) {
|
||||
try {
|
||||
var probeResult = probeContentType(url, cookies, timer, contentTags);
|
||||
logger.info(crawlerAuditMarker, "Probe result {} for {}", probeResult.getClass().getSimpleName(), url);
|
||||
switch (probeResult) {
|
||||
case HttpFetcher.ContentTypeProbeResult.NoOp():
|
||||
break; //
|
||||
case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl):
|
||||
url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching
|
||||
break;
|
||||
case ContentTypeProbeResult.BadContentType badContentType:
|
||||
warcRecorder.flagAsFailedContentTypeProbe(url, badContentType.contentType(), badContentType.statusCode());
|
||||
return new HttpFetchResult.ResultNone();
|
||||
case ContentTypeProbeResult.BadContentType.Timeout(Exception ex):
|
||||
warcRecorder.flagAsTimeout(url);
|
||||
return new HttpFetchResult.ResultException(ex);
|
||||
case ContentTypeProbeResult.Exception(Exception ex):
|
||||
warcRecorder.flagAsError(url, ex);
|
||||
return new HttpFetchResult.ResultException(ex);
|
||||
case ContentTypeProbeResult.HttpError httpError:
|
||||
return new HttpFetchResult.ResultException(new HttpException("HTTP status code " + httpError.statusCode() + ": " + httpError.message()));
|
||||
case ContentTypeProbeResult.Redirect redirect:
|
||||
return new HttpFetchResult.ResultRedirect(redirect.location());
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.warn("Failed to fetch {}", url, ex);
|
||||
return new HttpFetchResult.ResultException(ex);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return result;
|
||||
HttpGet request = new HttpGet(url.asURI());
|
||||
request.addHeader("User-Agent", userAgentString);
|
||||
request.addHeader("Accept-Encoding", "gzip");
|
||||
request.addHeader("Accept-Language", "en,*;q=0.5");
|
||||
request.addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8");
|
||||
|
||||
contentTags.paint(request);
|
||||
|
||||
try (var sl = new SendLock()) {
|
||||
HttpFetchResult result = warcRecorder.fetch(client, cookies, request);
|
||||
|
||||
if (result instanceof HttpFetchResult.ResultOk ok) {
|
||||
if (ok.statusCode() == 304) {
|
||||
return new HttpFetchResult.Result304Raw();
|
||||
}
|
||||
}
|
||||
|
||||
switch (result) {
|
||||
case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {}", ok.statusCode(), url);
|
||||
case HttpFetchResult.ResultRedirect redirect -> logger.info(crawlerAuditMarker, "Fetch result redirect: {} for {}", redirect.url(), url);
|
||||
case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none for {}", url);
|
||||
case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception for " + url + ": {}", ex.ex());
|
||||
case HttpFetchResult.Result304Raw raw -> logger.info(crawlerAuditMarker, "Fetch result: 304 Raw for {}", url);
|
||||
case HttpFetchResult.Result304ReplacedWithReference ref -> logger.info(crawlerAuditMarker, "Fetch result: 304 With reference for {}", url);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
ex.printStackTrace();
|
||||
return new HttpFetchResult.ResultException(ex);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -311,68 +517,66 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
}
|
||||
|
||||
|
||||
private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
|
||||
HttpRequest getRequest = HttpRequest.newBuilder()
|
||||
.GET()
|
||||
.uri(sitemapUrl.asURI())
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.header("Accept", "text/*, */*;q=0.9")
|
||||
.header("User-Agent", userAgentString)
|
||||
.timeout(requestTimeout)
|
||||
.build();
|
||||
private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException {
|
||||
HttpGet getRequest = new HttpGet(sitemapUrl.asURI());
|
||||
|
||||
getRequest.addHeader("User-Agent", userAgentString);
|
||||
getRequest.addHeader("Accept-Encoding", "gzip");
|
||||
getRequest.addHeader("Accept", "text/*, */*;q=0.9");
|
||||
getRequest.addHeader("User-Agent", userAgentString);
|
||||
|
||||
try (var sl = new SendLock()) {
|
||||
var response = client.send(getRequest, HttpResponse.BodyHandlers.ofInputStream());
|
||||
if (response.statusCode() != 200) {
|
||||
return new SitemapResult.SitemapError();
|
||||
}
|
||||
|
||||
Document parsedSitemap;
|
||||
|
||||
try (InputStream inputStream = response.body()) {
|
||||
InputStream parserStream;
|
||||
if (sitemapUrl.path.endsWith(".gz")) {
|
||||
parserStream = new GZIPInputStream(inputStream);
|
||||
} else {
|
||||
parserStream = inputStream;
|
||||
}
|
||||
|
||||
parsedSitemap = Jsoup.parse(parserStream, "UTF-8", sitemapUrl.toString(), Parser.xmlParser());
|
||||
}
|
||||
finally {
|
||||
sl.close();
|
||||
}
|
||||
|
||||
if (parsedSitemap.childrenSize() == 0) {
|
||||
return new SitemapResult.SitemapError();
|
||||
}
|
||||
|
||||
String rootTagName = parsedSitemap.child(0).tagName();
|
||||
|
||||
return switch (rootTagName.toLowerCase()) {
|
||||
case "sitemapindex" -> {
|
||||
List<String> references = new ArrayList<>();
|
||||
for (var locTag : parsedSitemap.getElementsByTag("loc")) {
|
||||
references.add(locTag.text().trim());
|
||||
return client.execute(getRequest, response -> {
|
||||
try {
|
||||
if (response.getCode() != 200) {
|
||||
return new SitemapResult.SitemapError();
|
||||
}
|
||||
yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
|
||||
}
|
||||
case "urlset" -> {
|
||||
List<String> urls = new ArrayList<>();
|
||||
for (var locTag : parsedSitemap.select("url > loc")) {
|
||||
urls.add(locTag.text().trim());
|
||||
|
||||
Document parsedSitemap = Jsoup.parse(
|
||||
EntityUtils.toString(response.getEntity()),
|
||||
sitemapUrl.toString(),
|
||||
Parser.xmlParser()
|
||||
);
|
||||
|
||||
if (parsedSitemap.childrenSize() == 0) {
|
||||
return new SitemapResult.SitemapError();
|
||||
}
|
||||
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
|
||||
|
||||
String rootTagName = parsedSitemap.child(0).tagName();
|
||||
|
||||
return switch (rootTagName.toLowerCase()) {
|
||||
case "sitemapindex" -> {
|
||||
List<String> references = new ArrayList<>();
|
||||
for (var locTag : parsedSitemap.getElementsByTag("loc")) {
|
||||
references.add(locTag.text().trim());
|
||||
}
|
||||
yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
|
||||
}
|
||||
case "urlset" -> {
|
||||
List<String> urls = new ArrayList<>();
|
||||
for (var locTag : parsedSitemap.select("url > loc")) {
|
||||
urls.add(locTag.text().trim());
|
||||
}
|
||||
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
|
||||
}
|
||||
case "rss", "atom" -> {
|
||||
List<String> urls = new ArrayList<>();
|
||||
for (var locTag : parsedSitemap.select("link, url")) {
|
||||
urls.add(locTag.text().trim());
|
||||
}
|
||||
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
|
||||
}
|
||||
default -> new SitemapResult.SitemapError();
|
||||
};
|
||||
}
|
||||
case "rss", "atom" -> {
|
||||
List<String> urls = new ArrayList<>();
|
||||
for (var locTag : parsedSitemap.select("link, url")) {
|
||||
urls.add(locTag.text().trim());
|
||||
}
|
||||
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
|
||||
finally {
|
||||
EntityUtils.consume(response.getEntity());
|
||||
}
|
||||
default -> new SitemapResult.SitemapError();
|
||||
};
|
||||
});
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.warn("Error while fetching sitemap {}: {} ({})", sitemapUrl, ex.getClass().getSimpleName(), ex.getMessage());
|
||||
return new SitemapResult.SitemapError();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -397,15 +601,13 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
|
||||
private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
|
||||
try (var sl = new SendLock()) {
|
||||
var getRequest = HttpRequest.newBuilder()
|
||||
.GET()
|
||||
.uri(url.asURI())
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.header("Accept", "text/*, */*;q=0.9")
|
||||
.header("User-Agent", userAgentString)
|
||||
.timeout(requestTimeout);
|
||||
|
||||
HttpFetchResult result = recorder.fetch(client, getRequest.build());
|
||||
HttpGet request = new HttpGet(url.asURI());
|
||||
request.addHeader("User-Agent", userAgentString);
|
||||
request.addHeader("Accept-Encoding", "gzip");
|
||||
request.addHeader("Accept", "text/*, */*;q=0.9");
|
||||
|
||||
HttpFetchResult result = recorder.fetch(client, new DomainCookies(), request);
|
||||
|
||||
return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) ->
|
||||
robotsParser.parseContent(url.toString(),
|
||||
@@ -419,6 +621,59 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
|
||||
if (exception instanceof SocketTimeoutException) { // Timeouts are not recoverable
|
||||
return false;
|
||||
}
|
||||
if (exception instanceof SSLException) { // SSL exceptions are unlikely to be recoverable
|
||||
return false;
|
||||
}
|
||||
|
||||
return executionCount <= 3;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean retryRequest(HttpResponse response, int executionCount, HttpContext context) {
|
||||
return switch (response.getCode()) {
|
||||
case 500, 503 -> executionCount <= 2;
|
||||
case 429 -> executionCount <= 3;
|
||||
default -> false;
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public TimeValue getRetryInterval(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
|
||||
return TimeValue.ofSeconds(1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TimeValue getRetryInterval(HttpResponse response, int executionCount, HttpContext context) {
|
||||
|
||||
int statusCode = response.getCode();
|
||||
|
||||
// Give 503 a bit more time
|
||||
if (statusCode == 503) return TimeValue.ofSeconds(5);
|
||||
|
||||
if (statusCode == 429) {
|
||||
// get the Retry-After header
|
||||
String retryAfter = response.getFirstHeader("Retry-After").getValue();
|
||||
if (retryAfter == null) {
|
||||
return TimeValue.ofSeconds(2);
|
||||
}
|
||||
|
||||
try {
|
||||
int retryAfterTime = Integer.parseInt(retryAfter);
|
||||
retryAfterTime = Math.clamp(retryAfterTime, 1, 5);
|
||||
|
||||
return TimeValue.ofSeconds(retryAfterTime);
|
||||
} catch (NumberFormatException e) {
|
||||
logger.warn("Invalid Retry-After header: {}", retryAfter);
|
||||
}
|
||||
}
|
||||
|
||||
return TimeValue.ofSeconds(2);
|
||||
}
|
||||
|
||||
public static class RateLimitException extends Exception {
|
||||
private final String retryAfter;
|
||||
@@ -451,9 +706,10 @@ class SendLock implements AutoCloseable {
|
||||
maxConcurrentRequests.acquireUninterruptibly();
|
||||
}
|
||||
|
||||
public static <T> HttpResponse<T> wrapSend(HttpClient client, HttpRequest request, HttpResponse.BodyHandler<T> handler) throws IOException, InterruptedException {
|
||||
public static <T> T wrapSend(HttpClient client, final ClassicHttpRequest request,
|
||||
final HttpClientResponseHandler<? extends T> responseHandler) throws IOException {
|
||||
try (var lock = new SendLock()) {
|
||||
return client.send(request, handler);
|
||||
return client.execute(request, responseHandler);
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -1,15 +1,20 @@
|
||||
package nu.marginalia.crawl.fetcher.warc;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.io.input.BOMInputStream;
|
||||
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||
import org.apache.hc.core5.http.ClassicHttpResponse;
|
||||
import org.apache.hc.core5.http.Header;
|
||||
import org.netpreserve.jwarc.WarcTruncationReason;
|
||||
|
||||
import java.io.*;
|
||||
import java.net.http.HttpHeaders;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Map;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.Arrays;
|
||||
|
||||
import static nu.marginalia.crawl.fetcher.warc.ErrorBuffer.suppressContentEncoding;
|
||||
|
||||
/** Input buffer for temporary storage of a HTTP response
|
||||
* This may be in-memory or on-disk, at the discretion of
|
||||
@@ -17,9 +22,9 @@ import java.util.zip.GZIPInputStream;
|
||||
* */
|
||||
public abstract class WarcInputBuffer implements AutoCloseable {
|
||||
protected WarcTruncationReason truncationReason = WarcTruncationReason.NOT_TRUNCATED;
|
||||
protected HttpHeaders headers;
|
||||
protected Header[] headers;
|
||||
|
||||
WarcInputBuffer(HttpHeaders headers) {
|
||||
WarcInputBuffer(Header[] headers) {
|
||||
this.headers = headers;
|
||||
}
|
||||
|
||||
@@ -31,7 +36,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
||||
|
||||
public final WarcTruncationReason truncationReason() { return truncationReason; }
|
||||
|
||||
public final HttpHeaders headers() { return headers; }
|
||||
public final Header[] headers() { return headers; }
|
||||
|
||||
/** Create a buffer for a response.
|
||||
* If the response is small and not compressed, it will be stored in memory.
|
||||
@@ -39,34 +44,52 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
||||
* and suppressed from the headers.
|
||||
* If an error occurs, a buffer will be created with no content and an error status.
|
||||
*/
|
||||
static WarcInputBuffer forResponse(HttpResponse<InputStream> rsp) {
|
||||
if (rsp == null)
|
||||
static WarcInputBuffer forResponse(ClassicHttpResponse response,
|
||||
HttpGet request,
|
||||
Duration timeLimit) throws IOException {
|
||||
if (response == null)
|
||||
return new ErrorBuffer();
|
||||
|
||||
var headers = rsp.headers();
|
||||
|
||||
try (var is = rsp.body()) {
|
||||
int contentLength = (int) headers.firstValueAsLong("Content-Length").orElse(-1L);
|
||||
String contentEncoding = headers.firstValue("Content-Encoding").orElse(null);
|
||||
var entity = response.getEntity();
|
||||
|
||||
if (contentEncoding == null && contentLength > 0 && contentLength < 8192) {
|
||||
if (null == entity) {
|
||||
return new ErrorBuffer();
|
||||
}
|
||||
|
||||
InputStream is = null;
|
||||
try {
|
||||
is = entity.getContent();
|
||||
long length = entity.getContentLength();
|
||||
|
||||
if (length > 0 && length < 8192) {
|
||||
// If the content is small and not compressed, we can just read it into memory
|
||||
return new MemoryBuffer(headers, is, contentLength);
|
||||
}
|
||||
else {
|
||||
return new MemoryBuffer(response.getHeaders(), request, timeLimit, is, (int) length);
|
||||
} else {
|
||||
// Otherwise, we unpack it into a file and read it from there
|
||||
return new FileBuffer(headers, is);
|
||||
return new FileBuffer(response.getHeaders(), request, timeLimit, is);
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
return new ErrorBuffer();
|
||||
finally {
|
||||
try {
|
||||
is.skip(Long.MAX_VALUE);
|
||||
}
|
||||
catch (IOException e) {
|
||||
// Ignore the exception
|
||||
}
|
||||
finally {
|
||||
// Close the input stream
|
||||
IOUtils.closeQuietly(is);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
/** Copy an input stream to an output stream, with a maximum size and time limit */
|
||||
protected void copy(InputStream is, OutputStream os) {
|
||||
long startTime = System.currentTimeMillis();
|
||||
protected void copy(InputStream is, HttpGet request, OutputStream os, Duration timeLimit) {
|
||||
Instant start = Instant.now();
|
||||
Instant timeout = start.plus(timeLimit);
|
||||
long size = 0;
|
||||
|
||||
byte[] buffer = new byte[8192];
|
||||
@@ -76,24 +99,105 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
||||
|
||||
while (true) {
|
||||
try {
|
||||
Duration remaining = Duration.between(Instant.now(), timeout);
|
||||
if (remaining.isNegative()) {
|
||||
truncationReason = WarcTruncationReason.TIME;
|
||||
// Abort the request if the time limit is exceeded
|
||||
// so we don't keep the connection open forever or are forced to consume
|
||||
// the stream to the end
|
||||
|
||||
request.abort();
|
||||
break;
|
||||
}
|
||||
|
||||
int n = is.read(buffer);
|
||||
|
||||
if (n < 0) break;
|
||||
size += n;
|
||||
os.write(buffer, 0, n);
|
||||
|
||||
if (size > WarcRecorder.MAX_SIZE) {
|
||||
// Even if we've exceeded the max length,
|
||||
// we keep consuming the stream up until the end or a timeout,
|
||||
// as closing the stream means resetting the connection, and
|
||||
// that's generally not desirable.
|
||||
|
||||
if (size < WarcRecorder.MAX_SIZE) {
|
||||
os.write(buffer, 0, n);
|
||||
}
|
||||
else if (truncationReason != WarcTruncationReason.LENGTH) {
|
||||
truncationReason = WarcTruncationReason.LENGTH;
|
||||
break;
|
||||
}
|
||||
|
||||
if (System.currentTimeMillis() - startTime > WarcRecorder.MAX_TIME) {
|
||||
truncationReason = WarcTruncationReason.TIME;
|
||||
break;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/** Takes a Content-Range header and checks if it is complete.
|
||||
* A complete range is one that covers the entire resource.
|
||||
* For example, "bytes 0-1023/2048" or "bytes 0-1023/*" are complete ranges.
|
||||
* "bytes 0-1023/2048" is not a complete range.
|
||||
*/
|
||||
public boolean isRangeComplete(Header[] headers) {
|
||||
// Find the Content-Range header
|
||||
String contentRangeHeader = null;
|
||||
for (var header : headers) {
|
||||
if ("Content-Range".equalsIgnoreCase(header.getName())) {
|
||||
contentRangeHeader = header.getValue();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Return true if header is null or empty
|
||||
if (contentRangeHeader == null || contentRangeHeader.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
try {
|
||||
// Content-Range format: "bytes range-start-range-end/size"
|
||||
// e.g., "bytes 0-1023/2048" or "bytes 0-1023/*"
|
||||
|
||||
// Get the part after "bytes "
|
||||
String[] parts = contentRangeHeader.split(" ", 2);
|
||||
if (parts.length < 2) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get the range and size parts (e.g., "0-1023/2048")
|
||||
String rangeAndSize = parts[1];
|
||||
String[] rangeAndSizeParts = rangeAndSize.split("/", 2);
|
||||
if (rangeAndSizeParts.length < 2) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get the range (e.g., "0-1023")
|
||||
String range = rangeAndSizeParts[0];
|
||||
String[] rangeParts = range.split("-", 2);
|
||||
if (rangeParts.length < 2) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get the size (e.g., "2048" or "*")
|
||||
String size = rangeAndSizeParts[1];
|
||||
|
||||
// If size is "*", we don't know the total size, so return false
|
||||
if ("*".equals(size)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Parse as long to handle large files
|
||||
long rangeStart = Long.parseLong(rangeParts[0]);
|
||||
long rangeEnd = Long.parseLong(rangeParts[1]);
|
||||
long totalSize = Long.parseLong(size);
|
||||
|
||||
// Check if the range covers the entire resource
|
||||
return rangeStart == 0 && rangeEnd == totalSize - 1;
|
||||
|
||||
} catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -101,7 +205,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
||||
/** Pseudo-buffer for when we have an error */
|
||||
class ErrorBuffer extends WarcInputBuffer {
|
||||
public ErrorBuffer() {
|
||||
super(HttpHeaders.of(Map.of(), (k,v)->false));
|
||||
super(new Header[0]);
|
||||
|
||||
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
||||
}
|
||||
@@ -118,17 +222,29 @@ class ErrorBuffer extends WarcInputBuffer {
|
||||
|
||||
@Override
|
||||
public void close() throws Exception {}
|
||||
|
||||
|
||||
static Header[] suppressContentEncoding(Header[] headers) {
|
||||
return Arrays.stream(headers).filter(header -> !"Content-Encoding".equalsIgnoreCase(header.getName())).toArray(Header[]::new);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/** Buffer for when we have the response in memory */
|
||||
class MemoryBuffer extends WarcInputBuffer {
|
||||
byte[] data;
|
||||
public MemoryBuffer(HttpHeaders headers, InputStream responseStream, int size) {
|
||||
super(headers);
|
||||
public MemoryBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream, int size) {
|
||||
super(suppressContentEncoding(headers));
|
||||
|
||||
if (!isRangeComplete(headers)) {
|
||||
truncationReason = WarcTruncationReason.LENGTH;
|
||||
} else {
|
||||
truncationReason = WarcTruncationReason.NOT_TRUNCATED;
|
||||
}
|
||||
|
||||
var outputStream = new ByteArrayOutputStream(size);
|
||||
|
||||
copy(responseStream, outputStream);
|
||||
copy(responseStream, request, outputStream, timeLimit);
|
||||
|
||||
data = outputStream.toByteArray();
|
||||
}
|
||||
@@ -152,40 +268,25 @@ class MemoryBuffer extends WarcInputBuffer {
|
||||
class FileBuffer extends WarcInputBuffer {
|
||||
private final Path tempFile;
|
||||
|
||||
public FileBuffer(HttpHeaders headers, InputStream responseStream) throws IOException {
|
||||
public FileBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream) throws IOException {
|
||||
super(suppressContentEncoding(headers));
|
||||
|
||||
if (!isRangeComplete(headers)) {
|
||||
truncationReason = WarcTruncationReason.LENGTH;
|
||||
} else {
|
||||
truncationReason = WarcTruncationReason.NOT_TRUNCATED;
|
||||
}
|
||||
|
||||
this.tempFile = Files.createTempFile("rsp", ".html");
|
||||
|
||||
|
||||
if ("gzip".equalsIgnoreCase(headers.firstValue("Content-Encoding").orElse(""))) {
|
||||
try (var out = Files.newOutputStream(tempFile)) {
|
||||
copy(new GZIPInputStream(responseStream), out);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
||||
}
|
||||
try (var out = Files.newOutputStream(tempFile)) {
|
||||
copy(responseStream, request, out, timeLimit);
|
||||
}
|
||||
else {
|
||||
try (var out = Files.newOutputStream(tempFile)) {
|
||||
copy(responseStream, out);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
||||
}
|
||||
catch (Exception ex) {
|
||||
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
||||
}
|
||||
}
|
||||
|
||||
private static HttpHeaders suppressContentEncoding(HttpHeaders headers) {
|
||||
return HttpHeaders.of(headers.map(), (k, v) -> {
|
||||
if ("Content-Encoding".equalsIgnoreCase(k)) {
|
||||
return false;
|
||||
}
|
||||
return !"Transfer-Encoding".equalsIgnoreCase(k);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
public InputStream read() throws IOException {
|
||||
return Files.newInputStream(tempFile);
|
||||
}
|
||||
|
@@ -1,6 +1,8 @@
|
||||
package nu.marginalia.crawl.fetcher.warc;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hc.core5.http.ClassicHttpResponse;
|
||||
import org.apache.hc.core5.http.Header;
|
||||
|
||||
import java.net.URI;
|
||||
import java.net.URLEncoder;
|
||||
@@ -17,7 +19,7 @@ import java.util.stream.Collectors;
|
||||
public class WarcProtocolReconstructor {
|
||||
|
||||
static String getHttpRequestString(String method,
|
||||
Map<String, List<String>> mainHeaders,
|
||||
Header[] mainHeaders,
|
||||
Map<String, List<String>> extraHeaders,
|
||||
URI uri) {
|
||||
StringBuilder requestStringBuilder = new StringBuilder();
|
||||
@@ -34,12 +36,13 @@ public class WarcProtocolReconstructor {
|
||||
|
||||
Set<String> addedHeaders = new HashSet<>();
|
||||
|
||||
mainHeaders.forEach((k, values) -> {
|
||||
for (var value : values) {
|
||||
addedHeaders.add(k);
|
||||
requestStringBuilder.append(capitalizeHeader(k)).append(": ").append(value).append("\r\n");
|
||||
}
|
||||
});
|
||||
for (var header : mainHeaders) {
|
||||
String k = header.getName();
|
||||
String v = header.getValue();
|
||||
|
||||
addedHeaders.add(k);
|
||||
requestStringBuilder.append(capitalizeHeader(k)).append(": ").append(v).append("\r\n");
|
||||
}
|
||||
|
||||
extraHeaders.forEach((k, values) -> {
|
||||
if (!addedHeaders.contains(k)) {
|
||||
@@ -87,6 +90,12 @@ public class WarcProtocolReconstructor {
|
||||
return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
|
||||
}
|
||||
|
||||
static String getResponseHeader(ClassicHttpResponse response, long size) {
|
||||
String headerString = getHeadersAsString(response.getHeaders(), size);
|
||||
|
||||
return response.getVersion().format() + " " + response.getCode() + " " + response.getReasonPhrase() + "\r\n" + headerString + "\r\n\r\n";
|
||||
}
|
||||
|
||||
private static final Map<Integer, String> STATUS_CODE_MAP = Map.ofEntries(
|
||||
Map.entry(200, "OK"),
|
||||
Map.entry(201, "Created"),
|
||||
@@ -149,6 +158,37 @@ public class WarcProtocolReconstructor {
|
||||
return joiner.toString();
|
||||
}
|
||||
|
||||
|
||||
|
||||
static private String getHeadersAsString(Header[] headers, long responseSize) {
|
||||
StringJoiner joiner = new StringJoiner("\r\n");
|
||||
|
||||
for (var header : headers) {
|
||||
String headerCapitalized = capitalizeHeader(header.getName());
|
||||
|
||||
// Omit pseudoheaders injected by the crawler itself
|
||||
if (headerCapitalized.startsWith("X-Marginalia"))
|
||||
continue;
|
||||
|
||||
// Omit Transfer-Encoding and Content-Encoding headers
|
||||
if (headerCapitalized.equals("Transfer-Encoding"))
|
||||
continue;
|
||||
if (headerCapitalized.equals("Content-Encoding"))
|
||||
continue;
|
||||
|
||||
// Since we're transparently decoding gzip, we need to update the Content-Length header
|
||||
// to reflect the actual size of the response body. We'll do this at the end.
|
||||
if (headerCapitalized.equals("Content-Length"))
|
||||
continue;
|
||||
|
||||
joiner.add(headerCapitalized + ": " + header.getValue());
|
||||
}
|
||||
|
||||
joiner.add("Content-Length: " + responseSize);
|
||||
|
||||
return joiner.toString();
|
||||
}
|
||||
|
||||
static private String getHeadersAsString(HttpHeaders headers, long responseSize) {
|
||||
StringJoiner joiner = new StringJoiner("\r\n");
|
||||
|
||||
|
@@ -1,11 +1,16 @@
|
||||
package nu.marginalia.crawl.fetcher.warc;
|
||||
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.fetcher.Cookies;
|
||||
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.body.HttpFetchResult;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||
import org.apache.hc.core5.http.NameValuePair;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
import org.netpreserve.jwarc.*;
|
||||
import org.slf4j.Logger;
|
||||
@@ -14,10 +19,9 @@ import org.slf4j.LoggerFactory;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.InetAddress;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
@@ -48,22 +52,15 @@ public class WarcRecorder implements AutoCloseable {
|
||||
// Affix a version string in case we need to change the format in the future
|
||||
// in some way
|
||||
private final String warcRecorderVersion = "1.0";
|
||||
private final Cookies cookies;
|
||||
private final LinkParser linkParser = new LinkParser();
|
||||
/**
|
||||
* Create a new WarcRecorder that will write to the given file
|
||||
*
|
||||
* @param warcFile The file to write to
|
||||
*/
|
||||
public WarcRecorder(Path warcFile, HttpFetcherImpl fetcher) throws IOException {
|
||||
public WarcRecorder(Path warcFile) throws IOException {
|
||||
this.warcFile = warcFile;
|
||||
this.writer = new WarcWriter(warcFile);
|
||||
this.cookies = fetcher.getCookies();
|
||||
}
|
||||
|
||||
public WarcRecorder(Path warcFile, Cookies cookies) throws IOException {
|
||||
this.warcFile = warcFile;
|
||||
this.writer = new WarcWriter(warcFile);
|
||||
this.cookies = cookies;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -73,16 +70,25 @@ public class WarcRecorder implements AutoCloseable {
|
||||
public WarcRecorder() throws IOException {
|
||||
this.warcFile = Files.createTempFile("warc", ".warc.gz");
|
||||
this.writer = new WarcWriter(this.warcFile);
|
||||
this.cookies = new Cookies();
|
||||
|
||||
temporaryFile = true;
|
||||
}
|
||||
|
||||
public HttpFetchResult fetch(HttpClient client,
|
||||
java.net.http.HttpRequest request)
|
||||
DomainCookies cookies,
|
||||
HttpGet request)
|
||||
throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
|
||||
{
|
||||
URI requestUri = request.uri();
|
||||
return fetch(client, cookies, request, Duration.ofMillis(MAX_TIME));
|
||||
}
|
||||
|
||||
public HttpFetchResult fetch(HttpClient client,
|
||||
DomainCookies cookies,
|
||||
HttpGet request,
|
||||
Duration timeout)
|
||||
throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
|
||||
{
|
||||
URI requestUri = request.getUri();
|
||||
|
||||
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
|
||||
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
|
||||
@@ -90,121 +96,151 @@ public class WarcRecorder implements AutoCloseable {
|
||||
Instant date = Instant.now();
|
||||
|
||||
// Not entirely sure why we need to do this, but keeping it due to Chesterton's Fence
|
||||
Map<String, List<String>> extraHeaders = new HashMap<>(request.headers().map());
|
||||
Map<String, List<String>> extraHeaders = new HashMap<>(request.getHeaders().length);
|
||||
|
||||
HttpResponse<InputStream> response;
|
||||
// Inject a range header to attempt to limit the size of the response
|
||||
// to the maximum size we want to store, if the server supports it.
|
||||
request.addHeader("Range", "bytes=0-"+MAX_SIZE);
|
||||
cookies.paintRequest(request);
|
||||
try {
|
||||
response = client.send(request, java.net.http.HttpResponse.BodyHandlers.ofInputStream());
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.warn("Failed to fetch URL {}: {}", requestUri, ex.getMessage());
|
||||
return client.execute(request,response -> {
|
||||
|
||||
try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request, timeout);
|
||||
InputStream inputStream = inputBuffer.read()) {
|
||||
|
||||
cookies.updateCookieStore(response);
|
||||
|
||||
// Build and write the request
|
||||
|
||||
WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder();
|
||||
|
||||
byte[] httpRequestString = WarcProtocolReconstructor
|
||||
.getHttpRequestString(
|
||||
request.getMethod(),
|
||||
request.getHeaders(),
|
||||
extraHeaders,
|
||||
requestUri)
|
||||
.getBytes();
|
||||
|
||||
requestDigestBuilder.update(httpRequestString);
|
||||
|
||||
WarcRequest warcRequest = new WarcRequest.Builder(requestUri)
|
||||
.blockDigest(requestDigestBuilder.build())
|
||||
.date(date)
|
||||
.body(MediaType.HTTP_REQUEST, httpRequestString)
|
||||
.build();
|
||||
|
||||
warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
|
||||
writer.write(warcRequest);
|
||||
|
||||
|
||||
if (cookies.hasCookies()) {
|
||||
response.addHeader("X-Has-Cookies", 1);
|
||||
}
|
||||
|
||||
byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
|
||||
|
||||
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);
|
||||
|
||||
responseDataBuffer.put(responseHeaders);
|
||||
responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseHeaders.length);
|
||||
|
||||
int dataStart = responseDataBuffer.pos();
|
||||
|
||||
for (;;) {
|
||||
int remainingLength = responseDataBuffer.remaining();
|
||||
if (remainingLength == 0)
|
||||
break;
|
||||
|
||||
int startPos = responseDataBuffer.pos();
|
||||
|
||||
int n = responseDataBuffer.readFrom(inputStream, remainingLength);
|
||||
if (n < 0)
|
||||
break;
|
||||
|
||||
responseDataBuffer.updateDigest(responseDigestBuilder, startPos, n);
|
||||
responseDataBuffer.updateDigest(payloadDigestBuilder, startPos, n);
|
||||
}
|
||||
|
||||
// with some http client libraries, that resolve redirects transparently, this might be different
|
||||
// from the request URI, but currently we don't have transparent redirect resolution so it's always
|
||||
// the same (though let's keep the variables separate in case this changes)
|
||||
final URI responseUri = requestUri;
|
||||
|
||||
WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
|
||||
.blockDigest(responseDigestBuilder.build())
|
||||
.date(date)
|
||||
.concurrentTo(warcRequest.id())
|
||||
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
|
||||
|
||||
InetAddress inetAddress = InetAddress.getByName(responseUri.getHost());
|
||||
responseBuilder.ipAddress(inetAddress);
|
||||
responseBuilder.payloadDigest(payloadDigestBuilder.build());
|
||||
responseBuilder.truncated(inputBuffer.truncationReason());
|
||||
|
||||
// Build and write the response
|
||||
|
||||
var warcResponse = responseBuilder.build();
|
||||
warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
|
||||
writer.write(warcResponse);
|
||||
|
||||
if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
|
||||
&& inputBuffer.size() < 2048
|
||||
&& !requestUri.getPath().endsWith("robots.txt")) // don't bail on robots.txt
|
||||
{
|
||||
// Fast detection and mitigation of crawler traps that respond with slow
|
||||
// small responses, with a high branching factor
|
||||
|
||||
// Note we bail *after* writing the warc records, this will effectively only
|
||||
// prevent link extraction from the document.
|
||||
|
||||
logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)",
|
||||
requestUri,
|
||||
Duration.between(date, Instant.now()).getSeconds(),
|
||||
inputBuffer.size()
|
||||
);
|
||||
|
||||
return new HttpFetchResult.ResultException(new IOException("Likely crawler trap"));
|
||||
}
|
||||
|
||||
if (response.getCode() == 301 || response.getCode() == 302 || response.getCode() == 307) {
|
||||
// If the server responds with a redirect, we need to
|
||||
// update the request URI to the new location
|
||||
EdgeUrl redirectLocation = Optional.ofNullable(response.getFirstHeader("Location"))
|
||||
.map(NameValuePair::getValue)
|
||||
.flatMap(location -> linkParser.parseLink(new EdgeUrl(requestUri), location))
|
||||
.orElse(null);
|
||||
if (redirectLocation != null) {
|
||||
// If the redirect location is a valid URL, we need to update the request URI
|
||||
return new HttpFetchResult.ResultRedirect(redirectLocation);
|
||||
} else {
|
||||
// If the redirect location is not a valid URL, we need to throw an exception
|
||||
return new HttpFetchResult.ResultException(new IOException("Invalid redirect location: " + response.getFirstHeader("Location")));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return new HttpFetchResult.ResultOk(responseUri,
|
||||
response.getCode(),
|
||||
inputBuffer.headers(),
|
||||
inetAddress.getHostAddress(),
|
||||
responseDataBuffer.data,
|
||||
dataStart,
|
||||
responseDataBuffer.length() - dataStart);
|
||||
} catch (Exception ex) {
|
||||
flagAsError(new EdgeUrl(requestUri), ex); // write a WARC record to indicate the error
|
||||
logger.warn("Failed to fetch URL {}: {}", requestUri, ex.getMessage());
|
||||
return new HttpFetchResult.ResultException(ex);
|
||||
}
|
||||
});
|
||||
// the client.execute() method will throw an exception if the request times out
|
||||
// or on other IO exceptions, so we need to catch those here as well as having
|
||||
// exception handling in the response handler
|
||||
} catch (SocketTimeoutException ex) {
|
||||
flagAsTimeout(new EdgeUrl(requestUri)); // write a WARC record to indicate the timeout
|
||||
return new HttpFetchResult.ResultException(ex);
|
||||
}
|
||||
|
||||
|
||||
try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response);
|
||||
InputStream inputStream = inputBuffer.read())
|
||||
{
|
||||
if (cookies.hasCookies()) {
|
||||
extraHeaders.put("X-Has-Cookies", List.of("1"));
|
||||
}
|
||||
|
||||
byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
|
||||
|
||||
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);
|
||||
|
||||
responseDataBuffer.put(responseHeaders);
|
||||
responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseHeaders.length);
|
||||
|
||||
int dataStart = responseDataBuffer.pos();
|
||||
|
||||
for (;;) {
|
||||
int remainingLength = responseDataBuffer.remaining();
|
||||
if (remainingLength == 0)
|
||||
break;
|
||||
|
||||
int startPos = responseDataBuffer.pos();
|
||||
|
||||
int n = responseDataBuffer.readFrom(inputStream, remainingLength);
|
||||
if (n < 0)
|
||||
break;
|
||||
|
||||
responseDataBuffer.updateDigest(responseDigestBuilder, startPos, n);
|
||||
responseDataBuffer.updateDigest(payloadDigestBuilder, startPos, n);
|
||||
}
|
||||
|
||||
// It looks like this might be the same as requestUri, but it's not;
|
||||
// it's the URI after resolving redirects.
|
||||
final URI responseUri = response.uri();
|
||||
|
||||
WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
|
||||
.blockDigest(responseDigestBuilder.build())
|
||||
.date(date)
|
||||
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
|
||||
|
||||
InetAddress inetAddress = InetAddress.getByName(responseUri.getHost());
|
||||
responseBuilder.ipAddress(inetAddress);
|
||||
responseBuilder.payloadDigest(payloadDigestBuilder.build());
|
||||
responseBuilder.truncated(inputBuffer.truncationReason());
|
||||
|
||||
// Build and write the response
|
||||
|
||||
var warcResponse = responseBuilder.build();
|
||||
warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
|
||||
writer.write(warcResponse);
|
||||
|
||||
// Build and write the request
|
||||
|
||||
WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder();
|
||||
|
||||
byte[] httpRequestString = WarcProtocolReconstructor
|
||||
.getHttpRequestString(
|
||||
response.request().method(),
|
||||
response.request().headers().map(),
|
||||
extraHeaders,
|
||||
requestUri)
|
||||
.getBytes();
|
||||
|
||||
requestDigestBuilder.update(httpRequestString);
|
||||
|
||||
WarcRequest warcRequest = new WarcRequest.Builder(requestUri)
|
||||
.blockDigest(requestDigestBuilder.build())
|
||||
.date(date)
|
||||
.body(MediaType.HTTP_REQUEST, httpRequestString)
|
||||
.concurrentTo(warcResponse.id())
|
||||
.build();
|
||||
|
||||
warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
|
||||
writer.write(warcRequest);
|
||||
|
||||
if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
|
||||
&& inputBuffer.size() < 2048
|
||||
&& !request.uri().getPath().endsWith("robots.txt")) // don't bail on robots.txt
|
||||
{
|
||||
// Fast detection and mitigation of crawler traps that respond with slow
|
||||
// small responses, with a high branching factor
|
||||
|
||||
// Note we bail *after* writing the warc records, this will effectively only
|
||||
// prevent link extraction from the document.
|
||||
|
||||
logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)",
|
||||
requestUri,
|
||||
Duration.between(date, Instant.now()).getSeconds(),
|
||||
inputBuffer.size()
|
||||
);
|
||||
|
||||
return new HttpFetchResult.ResultException(new IOException("Likely crawler trap"));
|
||||
}
|
||||
|
||||
return new HttpFetchResult.ResultOk(responseUri,
|
||||
response.statusCode(),
|
||||
inputBuffer.headers(),
|
||||
inetAddress.getHostAddress(),
|
||||
responseDataBuffer.data,
|
||||
dataStart,
|
||||
responseDataBuffer.length() - dataStart);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
} catch (IOException ex) {
|
||||
flagAsError(new EdgeUrl(requestUri), ex); // write a WARC record to indicate the error
|
||||
logger.warn("Failed to fetch URL {}: {}", requestUri, ex.getMessage());
|
||||
return new HttpFetchResult.ResultException(ex);
|
||||
}
|
||||
@@ -214,7 +250,7 @@ public class WarcRecorder implements AutoCloseable {
|
||||
writer.write(item);
|
||||
}
|
||||
|
||||
private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags contentTags) {
|
||||
private void saveOldResponse(EdgeUrl url, DomainCookies domainCookies, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags contentTags) {
|
||||
try {
|
||||
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
|
||||
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
|
||||
@@ -275,7 +311,7 @@ public class WarcRecorder implements AutoCloseable {
|
||||
.date(Instant.now())
|
||||
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
|
||||
|
||||
if (cookies.hasCookies()) {
|
||||
if (domainCookies.hasCookies() || (headers != null && headers.contains("Set-Cookie:"))) {
|
||||
builder.addHeader("X-Has-Cookies", "1");
|
||||
}
|
||||
|
||||
@@ -295,8 +331,8 @@ public class WarcRecorder implements AutoCloseable {
|
||||
* an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this
|
||||
* scenario we want to record the data as it was in the previous crawl, but not re-fetch it.
|
||||
*/
|
||||
public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags ctags) {
|
||||
saveOldResponse(url, contentType, statusCode, documentBody, headers, ctags);
|
||||
public void writeReferenceCopy(EdgeUrl url, DomainCookies cookies, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags ctags) {
|
||||
saveOldResponse(url, cookies, contentType, statusCode, documentBody, headers, ctags);
|
||||
}
|
||||
|
||||
public void writeWarcinfoHeader(String ip, EdgeDomain domain, HttpFetcherImpl.DomainProbeResult result) throws IOException {
|
||||
@@ -316,6 +352,9 @@ public class WarcRecorder implements AutoCloseable {
|
||||
case HttpFetcherImpl.DomainProbeResult.Ok ok:
|
||||
fields.put("X-WARC-Probe-Status", List.of("OK"));
|
||||
break;
|
||||
case HttpFetcher.DomainProbeResult.RedirectSameDomain_Internal redirectSameDomain:
|
||||
fields.put("X-WARC-Probe-Status", List.of("REDIR-INTERNAL"));
|
||||
break;
|
||||
}
|
||||
|
||||
var warcinfo = new Warcinfo.Builder()
|
||||
|
@@ -3,6 +3,7 @@ package nu.marginalia.crawl.retreival;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.concurrent.ThreadLocalRandom;
|
||||
|
||||
import static java.lang.Math.max;
|
||||
import static java.lang.Math.min;
|
||||
@@ -50,15 +51,20 @@ public class CrawlDelayTimer {
|
||||
waitFetchDelay(0);
|
||||
}
|
||||
|
||||
public void waitFetchDelay(Duration spentTime) {
|
||||
waitFetchDelay(spentTime.toMillis());
|
||||
}
|
||||
|
||||
public void waitFetchDelay(long spentTime) {
|
||||
long sleepTime = delayTime;
|
||||
|
||||
long jitter = ThreadLocalRandom.current().nextLong(0, 150);
|
||||
try {
|
||||
if (sleepTime >= 1) {
|
||||
if (spentTime > sleepTime)
|
||||
return;
|
||||
|
||||
Thread.sleep(min(sleepTime - spentTime, 5000));
|
||||
Thread.sleep(min(sleepTime - spentTime, 5000) + jitter);
|
||||
} else {
|
||||
// When no crawl delay is specified, lean toward twice the fetch+process time,
|
||||
// within sane limits. This means slower servers get slower crawling, and faster
|
||||
@@ -71,17 +77,17 @@ public class CrawlDelayTimer {
|
||||
if (spentTime > sleepTime)
|
||||
return;
|
||||
|
||||
Thread.sleep(sleepTime - spentTime);
|
||||
Thread.sleep(sleepTime - spentTime + jitter);
|
||||
}
|
||||
|
||||
if (slowDown) {
|
||||
// Additional delay when the server is signalling it wants slower requests
|
||||
Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS);
|
||||
Thread.sleep(DEFAULT_CRAWL_DELAY_MIN_MS + jitter);
|
||||
}
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
throw new RuntimeException();
|
||||
throw new RuntimeException("Interrupted", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -0,0 +1,42 @@
|
||||
package nu.marginalia.crawl.retreival;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.concurrent.Semaphore;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/**
|
||||
* This class is used to stagger the rate at which connections are created.
|
||||
* <p></p>
|
||||
* It is used to ensure that we do not create too many connections at once,
|
||||
* which can lead to network congestion and other issues. Since the connections
|
||||
* tend to be very long-lived, we can afford to wait a bit before creating the next
|
||||
* even if it adds a bit of build-up time when the crawl starts.
|
||||
*/
|
||||
public class CrawlerConnectionThrottle {
|
||||
private Instant lastCrawlStart = Instant.EPOCH;
|
||||
private final Semaphore launchSemaphore = new Semaphore(1);
|
||||
|
||||
private final Duration launchInterval;
|
||||
|
||||
public CrawlerConnectionThrottle(Duration launchInterval) {
|
||||
this.launchInterval = launchInterval;
|
||||
}
|
||||
|
||||
public void waitForConnectionPermission() throws InterruptedException {
|
||||
try {
|
||||
launchSemaphore.acquire();
|
||||
Instant nextPermittedLaunch = lastCrawlStart.plus(launchInterval);
|
||||
|
||||
if (nextPermittedLaunch.isAfter(Instant.now())) {
|
||||
long waitTime = Duration.between(Instant.now(), nextPermittedLaunch).toMillis();
|
||||
TimeUnit.MILLISECONDS.sleep(waitTime);
|
||||
}
|
||||
|
||||
lastCrawlStart = Instant.now();
|
||||
}
|
||||
finally {
|
||||
launchSemaphore.release();
|
||||
}
|
||||
}
|
||||
}
|
@@ -6,8 +6,8 @@ import nu.marginalia.contenttype.ContentType;
|
||||
import nu.marginalia.crawl.CrawlerMain;
|
||||
import nu.marginalia.crawl.DomainStateDb;
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.logic.LinkFilterSelector;
|
||||
import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
|
||||
@@ -26,14 +26,16 @@ import java.io.IOException;
|
||||
import java.net.InetAddress;
|
||||
import java.net.UnknownHostException;
|
||||
import java.nio.file.Path;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class CrawlerRetreiver implements AutoCloseable {
|
||||
|
||||
private static final int MAX_ERRORS = 20;
|
||||
private static final int HTTP_429_RETRY_LIMIT = 1; // Retry 429s once
|
||||
|
||||
private final HttpFetcher fetcher;
|
||||
|
||||
@@ -50,6 +52,11 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
private final DomainStateDb domainStateDb;
|
||||
private final WarcRecorder warcRecorder;
|
||||
private final CrawlerRevisitor crawlerRevisitor;
|
||||
private final DomainCookies cookies = new DomainCookies();
|
||||
|
||||
private static final CrawlerConnectionThrottle connectionThrottle = new CrawlerConnectionThrottle(
|
||||
Duration.ofSeconds(1) // pace the connections to avoid network congestion at startup
|
||||
);
|
||||
|
||||
int errorCount = 0;
|
||||
|
||||
@@ -90,6 +97,11 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
|
||||
public int crawlDomain(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
|
||||
try (oldCrawlData) {
|
||||
|
||||
// Wait for permission to open a connection to avoid network congestion
|
||||
// from hundreds/thousands of TCP handshakes
|
||||
connectionThrottle.waitForConnectionPermission();
|
||||
|
||||
// Do an initial domain probe to determine the root URL
|
||||
var probeResult = probeRootUrl();
|
||||
|
||||
@@ -108,15 +120,24 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, delayTimer);
|
||||
domainStateDb.save(summaryRecord);
|
||||
|
||||
if (Thread.interrupted()) {
|
||||
// There's a small chance we're interrupted during the sniffing portion
|
||||
throw new InterruptedException();
|
||||
}
|
||||
|
||||
Instant recrawlStart = Instant.now();
|
||||
CrawlerRevisitor.RecrawlMetadata recrawlMetadata = crawlerRevisitor.recrawl(oldCrawlData, cookies, robotsRules, delayTimer);
|
||||
Duration recrawlTime = Duration.between(recrawlStart, Instant.now());
|
||||
|
||||
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
|
||||
if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
|
||||
if (recrawlMetadata.size() > 0) {
|
||||
// If we have reference data, we will always grow the crawl depth a bit
|
||||
crawlFrontier.increaseDepth(1.5, 2500);
|
||||
}
|
||||
|
||||
oldCrawlData.close(); // proactively close the crawl data reference here to not hold onto expensive resources
|
||||
|
||||
yield crawlDomain(probedUrl, robotsRules, delayTimer, domainLinks);
|
||||
yield crawlDomain(probedUrl, robotsRules, delayTimer, domainLinks, recrawlMetadata, recrawlTime);
|
||||
}
|
||||
case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> {
|
||||
domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString()));
|
||||
@@ -126,6 +147,10 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, status.toString(), desc));
|
||||
yield 1;
|
||||
}
|
||||
default -> {
|
||||
logger.error("Unexpected domain probe result {}", probeResult);
|
||||
yield 1;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
@@ -138,17 +163,29 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
private int crawlDomain(EdgeUrl rootUrl,
|
||||
SimpleRobotRules robotsRules,
|
||||
CrawlDelayTimer delayTimer,
|
||||
DomainLinks domainLinks) {
|
||||
DomainLinks domainLinks,
|
||||
CrawlerRevisitor.RecrawlMetadata recrawlMetadata,
|
||||
Duration recrawlTime) {
|
||||
|
||||
Instant crawlStart = Instant.now();
|
||||
|
||||
// Add external links to the crawl frontier
|
||||
crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));
|
||||
|
||||
// Fetch sitemaps
|
||||
for (var sitemap : robotsRules.getSitemaps()) {
|
||||
crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
|
||||
|
||||
// Validate the sitemap URL and check if it belongs to the domain as the root URL
|
||||
if (EdgeUrl.parse(sitemap)
|
||||
.map(url -> url.getDomain().equals(rootUrl.domain))
|
||||
.orElse(false)) {
|
||||
|
||||
crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
|
||||
}
|
||||
}
|
||||
|
||||
int crawlerAdditions = 0;
|
||||
|
||||
while (!crawlFrontier.isEmpty()
|
||||
&& !crawlFrontier.isCrawlDepthReached()
|
||||
&& errorCount < MAX_ERRORS
|
||||
@@ -180,7 +217,11 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
continue;
|
||||
|
||||
try {
|
||||
fetchContentWithReference(top, delayTimer, DocumentWithReference.empty());
|
||||
var result = fetchContentWithReference(top, delayTimer, DocumentWithReference.empty());
|
||||
|
||||
if (result.isOk()) {
|
||||
crawlerAdditions++;
|
||||
}
|
||||
}
|
||||
catch (InterruptedException ex) {
|
||||
Thread.currentThread().interrupt();
|
||||
@@ -188,6 +229,17 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
}
|
||||
}
|
||||
|
||||
Duration crawlTime = Duration.between(crawlStart, Instant.now());
|
||||
domainStateDb.save(new DomainStateDb.CrawlMeta(
|
||||
domain,
|
||||
Instant.now(),
|
||||
recrawlTime,
|
||||
crawlTime,
|
||||
recrawlMetadata.errors(),
|
||||
crawlerAdditions,
|
||||
recrawlMetadata.size() + crawlerAdditions
|
||||
));
|
||||
|
||||
return crawlFrontier.visitedSize();
|
||||
}
|
||||
|
||||
@@ -216,17 +268,29 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
return domainProbeResult;
|
||||
}
|
||||
|
||||
|
||||
|
||||
private DomainStateDb.SummaryRecord sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
|
||||
Optional<String> feedLink = Optional.empty();
|
||||
|
||||
try {
|
||||
var url = rootUrl.withPathAndParam("/", null);
|
||||
|
||||
HttpFetchResult result = fetchWithRetry(url, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
|
||||
HttpFetchResult result = fetcher.fetchContent(url, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
|
||||
timer.waitFetchDelay(0);
|
||||
|
||||
if (!(result instanceof HttpFetchResult.ResultOk ok))
|
||||
if (result instanceof HttpFetchResult.ResultRedirect(EdgeUrl location)) {
|
||||
if (Objects.equals(location.domain, url.domain)) {
|
||||
// TODO: Follow the redirect to the new location and sniff the document
|
||||
crawlFrontier.addFirst(location);
|
||||
}
|
||||
|
||||
return DomainStateDb.SummaryRecord.forSuccess(domain);
|
||||
}
|
||||
|
||||
if (!(result instanceof HttpFetchResult.ResultOk ok)) {
|
||||
return DomainStateDb.SummaryRecord.forSuccess(domain);
|
||||
}
|
||||
|
||||
var optDoc = ok.parseDocument();
|
||||
if (optDoc.isEmpty())
|
||||
@@ -275,7 +339,7 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
|
||||
// Grab the favicon if it exists
|
||||
|
||||
if (fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty()) instanceof HttpFetchResult.ResultOk iconResult) {
|
||||
if (fetcher.fetchContent(faviconUrl, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED) instanceof HttpFetchResult.ResultOk iconResult) {
|
||||
String contentType = iconResult.header("Content-Type");
|
||||
byte[] iconData = iconResult.getBodyBytes();
|
||||
|
||||
@@ -289,6 +353,10 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Error configuring link filter", ex);
|
||||
if (Thread.interrupted()) {
|
||||
Thread.currentThread().interrupt();
|
||||
return DomainStateDb.SummaryRecord.forError(domain, "Crawler Interrupted", ex.getMessage());
|
||||
}
|
||||
}
|
||||
finally {
|
||||
crawlFrontier.addVisited(rootUrl);
|
||||
@@ -316,7 +384,7 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
);
|
||||
|
||||
private Optional<String> guessFeedUrl(CrawlDelayTimer timer) throws InterruptedException {
|
||||
var oldDomainStateRecord = domainStateDb.get(domain);
|
||||
var oldDomainStateRecord = domainStateDb.getSummary(domain);
|
||||
|
||||
// If we are already aware of an old feed URL, then we can just revalidate it
|
||||
if (oldDomainStateRecord.isPresent()) {
|
||||
@@ -341,7 +409,7 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
if (parsedOpt.isEmpty())
|
||||
return false;
|
||||
|
||||
HttpFetchResult result = fetchWithRetry(parsedOpt.get(), timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
|
||||
HttpFetchResult result = fetcher.fetchContent(parsedOpt.get(), warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
|
||||
timer.waitFetchDelay(0);
|
||||
|
||||
if (!(result instanceof HttpFetchResult.ResultOk ok)) {
|
||||
@@ -367,112 +435,63 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
CrawlDelayTimer timer,
|
||||
DocumentWithReference reference) throws InterruptedException
|
||||
{
|
||||
logger.debug("Fetching {}", top);
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
var contentTags = reference.getContentTags();
|
||||
|
||||
HttpFetchResult fetchedDoc = fetchWithRetry(top, timer, HttpFetcher.ProbeType.FULL, contentTags);
|
||||
HttpFetchResult fetchedDoc = fetcher.fetchContent(top, warcRecorder, cookies, timer, contentTags, HttpFetcher.ProbeType.FULL);
|
||||
timer.waitFetchDelay();
|
||||
|
||||
if (Thread.interrupted()) {
|
||||
Thread.currentThread().interrupt();
|
||||
throw new InterruptedException();
|
||||
}
|
||||
|
||||
// Parse the document and enqueue links
|
||||
try {
|
||||
if (fetchedDoc instanceof HttpFetchResult.ResultOk ok) {
|
||||
var docOpt = ok.parseDocument();
|
||||
if (docOpt.isPresent()) {
|
||||
var doc = docOpt.get();
|
||||
switch (fetchedDoc) {
|
||||
case HttpFetchResult.ResultOk ok -> {
|
||||
var docOpt = ok.parseDocument();
|
||||
if (docOpt.isPresent()) {
|
||||
var doc = docOpt.get();
|
||||
|
||||
var responseUrl = new EdgeUrl(ok.uri());
|
||||
var responseUrl = new EdgeUrl(ok.uri());
|
||||
|
||||
crawlFrontier.enqueueLinksFromDocument(responseUrl, doc);
|
||||
crawlFrontier.addVisited(responseUrl);
|
||||
crawlFrontier.enqueueLinksFromDocument(responseUrl, doc);
|
||||
crawlFrontier.addVisited(responseUrl);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {
|
||||
var doc = reference.doc();
|
||||
case HttpFetchResult.Result304Raw ref when reference.doc() != null ->
|
||||
{
|
||||
var doc = reference.doc();
|
||||
|
||||
warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBodyBytes, doc.headers, contentTags);
|
||||
warcRecorder.writeReferenceCopy(top, cookies, doc.contentType, doc.httpStatus, doc.documentBodyBytes, doc.headers, contentTags);
|
||||
|
||||
fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
|
||||
new ContentType(doc.contentType, "UTF-8"),
|
||||
doc.documentBodyBytes);
|
||||
fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
|
||||
new ContentType(doc.contentType, "UTF-8"),
|
||||
doc.documentBodyBytes);
|
||||
|
||||
if (doc.documentBodyBytes != null) {
|
||||
var parsed = doc.parseBody();
|
||||
if (doc.documentBodyBytes != null) {
|
||||
var parsed = doc.parseBody();
|
||||
|
||||
crawlFrontier.enqueueLinksFromDocument(top, parsed);
|
||||
crawlFrontier.addVisited(top);
|
||||
crawlFrontier.enqueueLinksFromDocument(top, parsed);
|
||||
crawlFrontier.addVisited(top);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (fetchedDoc instanceof HttpFetchResult.ResultException) {
|
||||
errorCount ++;
|
||||
case HttpFetchResult.ResultRedirect(EdgeUrl location) -> {
|
||||
if (Objects.equals(location.domain, top.domain)) {
|
||||
crawlFrontier.addFirst(location);
|
||||
}
|
||||
}
|
||||
case HttpFetchResult.ResultException ex -> errorCount++;
|
||||
default -> {} // Ignore other types
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Error parsing document {}", top, ex);
|
||||
}
|
||||
|
||||
timer.waitFetchDelay(System.currentTimeMillis() - startTime);
|
||||
|
||||
return fetchedDoc;
|
||||
}
|
||||
|
||||
/** Fetch a document and retry on 429s */
|
||||
private HttpFetchResult fetchWithRetry(EdgeUrl url,
|
||||
CrawlDelayTimer timer,
|
||||
HttpFetcher.ProbeType probeType,
|
||||
ContentTags contentTags) throws InterruptedException {
|
||||
|
||||
long probeStart = System.currentTimeMillis();
|
||||
|
||||
if (probeType == HttpFetcher.ProbeType.FULL) {
|
||||
retryLoop:
|
||||
for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
|
||||
try {
|
||||
var probeResult = fetcher.probeContentType(url, warcRecorder, contentTags);
|
||||
|
||||
switch (probeResult) {
|
||||
case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl):
|
||||
url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching
|
||||
break retryLoop;
|
||||
case HttpFetcher.ContentTypeProbeResult.BadContentType badContentType:
|
||||
return new HttpFetchResult.ResultNone();
|
||||
case HttpFetcher.ContentTypeProbeResult.BadContentType.Timeout timeout:
|
||||
return new HttpFetchResult.ResultException(timeout.ex());
|
||||
case HttpFetcher.ContentTypeProbeResult.Exception exception:
|
||||
return new HttpFetchResult.ResultException(exception.ex());
|
||||
default: // should be unreachable
|
||||
throw new IllegalStateException("Unknown probe result");
|
||||
}
|
||||
}
|
||||
catch (HttpFetcherImpl.RateLimitException ex) {
|
||||
timer.waitRetryDelay(ex);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.warn("Failed to fetch {}", url, ex);
|
||||
return new HttpFetchResult.ResultException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
timer.waitFetchDelay(System.currentTimeMillis() - probeStart);
|
||||
}
|
||||
|
||||
|
||||
for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
|
||||
try {
|
||||
return fetcher.fetchContent(url, warcRecorder, contentTags, probeType);
|
||||
}
|
||||
catch (HttpFetcherImpl.RateLimitException ex) {
|
||||
timer.waitRetryDelay(ex);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.warn("Failed to fetch {}", url, ex);
|
||||
return new HttpFetchResult.ResultException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
return new HttpFetchResult.ResultNone();
|
||||
}
|
||||
|
||||
private boolean isAllowedProtocol(String proto) {
|
||||
return proto.equalsIgnoreCase("http")
|
||||
|| proto.equalsIgnoreCase("https");
|
||||
|
@@ -55,6 +55,9 @@ public class DomainCrawlFrontier {
|
||||
}
|
||||
}
|
||||
|
||||
public EdgeDomain getDomain() {
|
||||
return thisDomain;
|
||||
}
|
||||
/** Increase the depth of the crawl by a factor. If the current depth is smaller
|
||||
* than the number of already visited documents, the base depth will be adjusted
|
||||
* to the visited count first.
|
||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.crawl.retreival.revisit;
|
||||
|
||||
import crawlercommons.robots.SimpleRobotRules;
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.retreival.CrawlDataReference;
|
||||
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
||||
@@ -10,6 +11,8 @@ import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.body.HttpFetchResult;
|
||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
@@ -18,10 +21,13 @@ import java.io.IOException;
|
||||
* E-Tag and Last-Modified headers.
|
||||
*/
|
||||
public class CrawlerRevisitor {
|
||||
|
||||
private final DomainCrawlFrontier crawlFrontier;
|
||||
private final CrawlerRetreiver crawlerRetreiver;
|
||||
private final WarcRecorder warcRecorder;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(CrawlerRevisitor.class);
|
||||
|
||||
public CrawlerRevisitor(DomainCrawlFrontier crawlFrontier,
|
||||
CrawlerRetreiver crawlerRetreiver,
|
||||
WarcRecorder warcRecorder) {
|
||||
@@ -31,7 +37,8 @@ public class CrawlerRevisitor {
|
||||
}
|
||||
|
||||
/** Performs a re-crawl of old documents, comparing etags and last-modified */
|
||||
public int recrawl(CrawlDataReference oldCrawlData,
|
||||
public RecrawlMetadata recrawl(CrawlDataReference oldCrawlData,
|
||||
DomainCookies cookies,
|
||||
SimpleRobotRules robotsRules,
|
||||
CrawlDelayTimer delayTimer)
|
||||
throws InterruptedException {
|
||||
@@ -39,6 +46,7 @@ public class CrawlerRevisitor {
|
||||
int retained = 0;
|
||||
int errors = 0;
|
||||
int skipped = 0;
|
||||
int size = 0;
|
||||
|
||||
for (CrawledDocument doc : oldCrawlData) {
|
||||
if (errors > 20) {
|
||||
@@ -46,6 +54,10 @@ public class CrawlerRevisitor {
|
||||
break;
|
||||
}
|
||||
|
||||
if (Thread.interrupted()) {
|
||||
throw new InterruptedException();
|
||||
}
|
||||
|
||||
var urlMaybe = EdgeUrl.parse(doc.url);
|
||||
if (urlMaybe.isEmpty())
|
||||
continue;
|
||||
@@ -78,6 +90,7 @@ public class CrawlerRevisitor {
|
||||
continue;
|
||||
}
|
||||
|
||||
size++;
|
||||
|
||||
double skipProb;
|
||||
|
||||
@@ -121,6 +134,7 @@ public class CrawlerRevisitor {
|
||||
}
|
||||
// Add a WARC record so we don't repeat this
|
||||
warcRecorder.writeReferenceCopy(url,
|
||||
cookies,
|
||||
doc.contentType,
|
||||
doc.httpStatus,
|
||||
doc.documentBodyBytes,
|
||||
@@ -145,11 +159,15 @@ public class CrawlerRevisitor {
|
||||
else if (result instanceof HttpFetchResult.ResultException) {
|
||||
errors++;
|
||||
}
|
||||
|
||||
recrawled++;
|
||||
}
|
||||
}
|
||||
|
||||
return recrawled;
|
||||
logger.info("Recrawl summary {}: {} recrawled, {} retained, {} errors, {} skipped",
|
||||
crawlFrontier.getDomain(), recrawled, retained, errors, skipped);
|
||||
|
||||
return new RecrawlMetadata(size, errors, skipped);
|
||||
}
|
||||
|
||||
public record RecrawlMetadata(int size, int errors, int skipped) {}
|
||||
}
|
||||
|
@@ -6,6 +6,7 @@ import nu.marginalia.model.body.HttpFetchResult;
|
||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.Objects;
|
||||
|
||||
public record DocumentWithReference(
|
||||
@Nullable CrawledDocument doc,
|
||||
@@ -33,8 +34,22 @@ public record DocumentWithReference(
|
||||
return false;
|
||||
if (doc == null)
|
||||
return false;
|
||||
if (doc.documentBodyBytes.length == 0)
|
||||
return false;
|
||||
if (doc.documentBodyBytes.length == 0) {
|
||||
if (doc.httpStatus < 300) {
|
||||
return resultOk.bytesLength() == 0;
|
||||
}
|
||||
else if (doc.httpStatus == 301 || doc.httpStatus == 302 || doc.httpStatus == 307) {
|
||||
@Nullable
|
||||
String docLocation = doc.getHeader("Location");
|
||||
@Nullable
|
||||
String resultLocation = resultOk.header("Location");
|
||||
|
||||
return Objects.equals(docLocation, resultLocation);
|
||||
}
|
||||
else {
|
||||
return doc.httpStatus == resultOk.statusCode();
|
||||
}
|
||||
}
|
||||
|
||||
return CrawlDataReference.isContentBodySame(doc.documentBodyBytes, resultOk.bytesRaw());
|
||||
}
|
||||
|
@@ -41,6 +41,8 @@ dependencies {
|
||||
implementation libs.snakeyaml
|
||||
implementation libs.zstd
|
||||
|
||||
implementation libs.bundles.httpcomponents
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
|
@@ -1,6 +1,9 @@
|
||||
package nu.marginalia.model.body;
|
||||
|
||||
import nu.marginalia.contenttype.ContentType;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.apache.hc.core5.http.Header;
|
||||
import org.apache.hc.core5.http.message.BasicHeader;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
@@ -11,8 +14,10 @@ import java.io.ByteArrayInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.net.InetAddress;
|
||||
import java.net.URI;
|
||||
import java.net.http.HttpHeaders;
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
/* FIXME: This interface has a very unfortunate name that is not very descriptive.
|
||||
*/
|
||||
@@ -56,7 +61,7 @@ public sealed interface HttpFetchResult {
|
||||
*/
|
||||
record ResultOk(URI uri,
|
||||
int statusCode,
|
||||
HttpHeaders headers,
|
||||
Header[] headers,
|
||||
String ipAddress,
|
||||
byte[] bytesRaw, // raw data for the entire response including headers
|
||||
int bytesStart,
|
||||
@@ -67,18 +72,19 @@ public sealed interface HttpFetchResult {
|
||||
this(uri, status, convertHeaders(headers), ipAddress, bytes, bytesStart, length);
|
||||
}
|
||||
|
||||
private static HttpHeaders convertHeaders(MessageHeaders messageHeaders) {
|
||||
Map<String, List<String>> inputMap = messageHeaders.map();
|
||||
Map<String, List<String>> filteredMap = new HashMap<>(Math.max(4, inputMap.size()));
|
||||
private static Header[] convertHeaders(MessageHeaders messageHeaders) {
|
||||
List<Header> headers = new ArrayList<>(12);
|
||||
|
||||
inputMap.forEach((k, v) -> {
|
||||
messageHeaders.map().forEach((k, v) -> {
|
||||
if (k.isBlank()) return;
|
||||
if (!Character.isAlphabetic(k.charAt(0))) return;
|
||||
|
||||
filteredMap.put(k, v);
|
||||
for (var value : v) {
|
||||
headers.add(new BasicHeader(k, value));
|
||||
}
|
||||
});
|
||||
|
||||
return HttpHeaders.of(filteredMap, (k,v) -> true);
|
||||
return headers.toArray(new Header[0]);
|
||||
}
|
||||
|
||||
public boolean isOk() {
|
||||
@@ -108,7 +114,13 @@ public sealed interface HttpFetchResult {
|
||||
|
||||
@Nullable
|
||||
public String header(String name) {
|
||||
return headers.firstValue(name).orElse(null);
|
||||
for (var header : headers) {
|
||||
if (header.getName().equalsIgnoreCase(name)) {
|
||||
String headerValue = header.getValue();
|
||||
return headerValue;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -132,6 +144,12 @@ public sealed interface HttpFetchResult {
|
||||
}
|
||||
}
|
||||
|
||||
record ResultRedirect(EdgeUrl url) implements HttpFetchResult {
|
||||
public boolean isOk() {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/** Fetching resulted in a HTTP 304, the remote content is identical to
|
||||
* our reference copy. This will be replaced with a Result304ReplacedWithReference
|
||||
* at a later stage.
|
||||
|
@@ -102,7 +102,7 @@ public final class CrawledDocument implements SerializableCrawlData {
|
||||
}
|
||||
|
||||
@Nullable
|
||||
private String getHeader(String header) {
|
||||
public String getHeader(String header) {
|
||||
if (headers == null) {
|
||||
return null;
|
||||
}
|
||||
|
@@ -165,12 +165,26 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
|
||||
contentType = "";
|
||||
}
|
||||
|
||||
boolean hasCookies = false;
|
||||
String etag = null;
|
||||
String lastModified = null;
|
||||
|
||||
StringJoiner headersStrBuilder = new StringJoiner("\n");
|
||||
for (var header : headers.map().entrySet()) {
|
||||
for (var value : header.getValue()) {
|
||||
headersStrBuilder.add(header.getKey() + ": " + value);
|
||||
for (var header : headers) {
|
||||
if (header.getName().equalsIgnoreCase("X-Has-Cookies")) {
|
||||
hasCookies = hasCookies || header.getValue().equals("1");
|
||||
}
|
||||
else if (header.getName().equalsIgnoreCase("ETag")) {
|
||||
etag = header.getValue();
|
||||
}
|
||||
else if (header.getName().equalsIgnoreCase("Last-Modified")) {
|
||||
lastModified = header.getValue();
|
||||
}
|
||||
else {
|
||||
headersStrBuilder.add(header.getName() + ": " + header.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
String headersStr = headersStrBuilder.toString();
|
||||
|
||||
|
||||
@@ -178,14 +192,14 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
|
||||
domain,
|
||||
response.target(),
|
||||
fetchOk.ipAddress(),
|
||||
headers.firstValue("X-Has-Cookies").orElse("0").equals("1"),
|
||||
hasCookies,
|
||||
fetchOk.statusCode(),
|
||||
response.date(),
|
||||
contentType,
|
||||
bodyBytes,
|
||||
headersStr,
|
||||
headers.firstValue("ETag").orElse(null),
|
||||
headers.firstValue("Last-Modified").orElse(null)
|
||||
etag,
|
||||
lastModified
|
||||
));
|
||||
}
|
||||
|
||||
|
@@ -341,12 +341,15 @@ public record SlopCrawlDataRecord(String domain,
|
||||
contentType = "";
|
||||
}
|
||||
|
||||
boolean hasCookies = false;
|
||||
|
||||
String headersStr;
|
||||
StringJoiner headersStrBuilder = new StringJoiner("\n");
|
||||
for (var header : headers.map().entrySet()) {
|
||||
for (var value : header.getValue()) {
|
||||
headersStrBuilder.add(header.getKey() + ": " + value);
|
||||
for (var header : headers) {
|
||||
if (header.getName().equalsIgnoreCase("X-Cookies") && "1".equals(header.getValue())) {
|
||||
hasCookies = true;
|
||||
}
|
||||
headersStrBuilder.add(header.getName() + ": " + header.getValue());
|
||||
}
|
||||
headersStr = headersStrBuilder.toString();
|
||||
|
||||
@@ -355,7 +358,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
domain,
|
||||
response.target(),
|
||||
fetchOk.ipAddress(),
|
||||
"1".equals(headers.firstValue("X-Cookies").orElse("0")),
|
||||
hasCookies,
|
||||
fetchOk.statusCode(),
|
||||
response.date().toEpochMilli(),
|
||||
contentType,
|
||||
|
@@ -8,6 +8,7 @@ import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
@@ -47,8 +48,8 @@ class DomainStateDbTest {
|
||||
db.save(allFields);
|
||||
db.save(minFields);
|
||||
|
||||
assertEquals(allFields, db.get("all.marginalia.nu").orElseThrow());
|
||||
assertEquals(minFields, db.get("min.marginalia.nu").orElseThrow());
|
||||
assertEquals(allFields, db.getSummary("all.marginalia.nu").orElseThrow());
|
||||
assertEquals(minFields, db.getSummary("min.marginalia.nu").orElseThrow());
|
||||
|
||||
var updatedAllFields = new DomainStateDb.SummaryRecord(
|
||||
"all.marginalia.nu",
|
||||
@@ -59,7 +60,19 @@ class DomainStateDbTest {
|
||||
);
|
||||
|
||||
db.save(updatedAllFields);
|
||||
assertEquals(updatedAllFields, db.get("all.marginalia.nu").orElseThrow());
|
||||
assertEquals(updatedAllFields, db.getSummary("all.marginalia.nu").orElseThrow());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMetadata() throws SQLException {
|
||||
try (var db = new DomainStateDb(tempFile)) {
|
||||
var original = new DomainStateDb.CrawlMeta("example.com", Instant.ofEpochMilli(12345), Duration.ofMillis(30), Duration.ofMillis(300), 1, 2, 3);
|
||||
db.save(original);
|
||||
|
||||
var maybeMeta = db.getMeta("example.com");
|
||||
assertTrue(maybeMeta.isPresent());
|
||||
assertEquals(original, maybeMeta.get());
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -0,0 +1,146 @@
|
||||
package nu.marginalia.crawl.fetcher;
|
||||
|
||||
import com.github.tomakehurst.wiremock.WireMockServer;
|
||||
import com.github.tomakehurst.wiremock.client.WireMock;
|
||||
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
|
||||
import nu.marginalia.UserAgent;
|
||||
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.junit.jupiter.api.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
@Tag("slow")
|
||||
class HttpFetcherImplContentTypeProbeTest {
|
||||
|
||||
private HttpFetcherImpl fetcher;
|
||||
private static WireMockServer wireMockServer;
|
||||
|
||||
private static EdgeUrl timeoutUrl;
|
||||
private static EdgeUrl contentTypeHtmlUrl;
|
||||
private static EdgeUrl contentTypeBinaryUrl;
|
||||
private static EdgeUrl redirectUrl;
|
||||
private static EdgeUrl badHttpStatusUrl;
|
||||
private static EdgeUrl onlyGetAllowedUrl;
|
||||
|
||||
@BeforeAll
|
||||
public static void setupAll() throws URISyntaxException {
|
||||
wireMockServer =
|
||||
new WireMockServer(WireMockConfiguration.wireMockConfig()
|
||||
.port(18089));
|
||||
|
||||
timeoutUrl = new EdgeUrl("http://localhost:18089/timeout.bin");
|
||||
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(timeoutUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withFixedDelay(15000))); // 10 seconds delay to simulate timeout
|
||||
|
||||
contentTypeHtmlUrl = new EdgeUrl("http://localhost:18089/test.html.bin");
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(contentTypeHtmlUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withStatus(200)));
|
||||
|
||||
contentTypeBinaryUrl = new EdgeUrl("http://localhost:18089/test.bad.bin");
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(contentTypeBinaryUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "application/octet-stream")
|
||||
.withStatus(200)));
|
||||
|
||||
redirectUrl = new EdgeUrl("http://localhost:18089/redirect.bin");
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(redirectUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Location", "http://localhost:18089/test.html.bin")
|
||||
.withStatus(301)));
|
||||
|
||||
badHttpStatusUrl = new EdgeUrl("http://localhost:18089/badstatus.bin");
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(badHttpStatusUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withStatus(500)));
|
||||
|
||||
onlyGetAllowedUrl = new EdgeUrl("http://localhost:18089/onlyget.bin");
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(onlyGetAllowedUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withStatus(405))); // Method Not Allowed
|
||||
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(onlyGetAllowedUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withStatus(200)));
|
||||
|
||||
wireMockServer.start();
|
||||
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void tearDownAll() {
|
||||
wireMockServer.stop();
|
||||
}
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() {
|
||||
fetcher = new HttpFetcherImpl(new UserAgent("test.marginalia.nu", "test.marginalia.nu"));
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
public void tearDown() throws IOException {
|
||||
var stats = fetcher.getPoolStats();
|
||||
assertEquals(0, stats.getLeased());
|
||||
assertEquals(0, stats.getPending());
|
||||
|
||||
fetcher.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProbeContentTypeHtmlShortcircuitPath() throws URISyntaxException {
|
||||
var result = fetcher.probeContentType(new EdgeUrl("https://localhost/test.html"), new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
|
||||
Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testProbeContentTypeHtmlShortcircuitTags() {
|
||||
var result = fetcher.probeContentType(contentTypeBinaryUrl, new DomainCookies(), new CrawlDelayTimer(50), new ContentTags("a", "b"));
|
||||
Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProbeContentTypeHtml() {
|
||||
var result = fetcher.probeContentType(contentTypeHtmlUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
|
||||
Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Ok(contentTypeHtmlUrl), result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProbeContentTypeBinary() {
|
||||
var result = fetcher.probeContentType(contentTypeBinaryUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
|
||||
Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.BadContentType("application/octet-stream", 200), result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProbeContentTypeRedirect() {
|
||||
var result = fetcher.probeContentType(redirectUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
|
||||
Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Redirect(contentTypeHtmlUrl), result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProbeContentTypeBadHttpStatus() {
|
||||
var result = fetcher.probeContentType(badHttpStatusUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
|
||||
Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.HttpError(500, "Bad status code"), result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOnlyGetAllowed() {
|
||||
var result = fetcher.probeContentType(onlyGetAllowedUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
|
||||
Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Ok(onlyGetAllowedUrl), result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTimeout() {
|
||||
var result = fetcher.probeContentType(timeoutUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
|
||||
Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Timeout.class, result);
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,95 @@
|
||||
package nu.marginalia.crawl.fetcher;
|
||||
|
||||
import com.github.tomakehurst.wiremock.WireMockServer;
|
||||
import com.github.tomakehurst.wiremock.client.WireMock;
|
||||
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
|
||||
import nu.marginalia.UserAgent;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
|
||||
import org.junit.jupiter.api.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
@Tag("slow")
|
||||
class HttpFetcherImplDomainProbeTest {
|
||||
|
||||
private HttpFetcherImpl fetcher;
|
||||
private static WireMockServer wireMockServer;
|
||||
|
||||
private static EdgeUrl timeoutUrl;
|
||||
|
||||
@BeforeAll
|
||||
public static void setupAll() throws URISyntaxException {
|
||||
wireMockServer =
|
||||
new WireMockServer(WireMockConfiguration.wireMockConfig()
|
||||
.port(18089));
|
||||
|
||||
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo("/timeout"))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withFixedDelay(15000))); // 10 seconds delay to simulate timeout
|
||||
|
||||
wireMockServer.start();
|
||||
timeoutUrl = new EdgeUrl("http://localhost:18089/timeout");
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void tearDownAll() {
|
||||
wireMockServer.stop();
|
||||
}
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() {
|
||||
fetcher = new HttpFetcherImpl(new UserAgent("test.marginalia.nu", "test.marginalia.nu"));
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
public void tearDown() throws IOException {
|
||||
var stats = fetcher.getPoolStats();
|
||||
assertEquals(0, stats.getLeased());
|
||||
assertEquals(0, stats.getPending());
|
||||
|
||||
fetcher.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProbeDomain() throws URISyntaxException {
|
||||
var result = fetcher.probeDomain(new EdgeUrl("https://www.marginalia.nu/"));
|
||||
Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Ok(new EdgeUrl("https://www.marginalia.nu/")), result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProbeDomainProtoUpgrade() throws URISyntaxException {
|
||||
var result = fetcher.probeDomain(new EdgeUrl("http://www.marginalia.nu/"));
|
||||
Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Ok(new EdgeUrl("https://www.marginalia.nu/")), result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProbeDomainRedirect() throws URISyntaxException {
|
||||
var result = fetcher.probeDomain(new EdgeUrl("http://search.marginalia.nu/"));
|
||||
Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Redirect(new EdgeDomain("marginalia-search.com")), result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProbeDomainOnlyGET() throws URISyntaxException {
|
||||
// This test is to check if the domain probe only allows GET requests
|
||||
var result = fetcher.probeDomain(new EdgeUrl("https://marginalia-search.com/"));
|
||||
Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Ok(new EdgeUrl("https://marginalia-search.com/")), result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProbeDomainError() throws URISyntaxException {
|
||||
var result = fetcher.probeDomain(new EdgeUrl("https://invalid.example.com/"));
|
||||
Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Error during domain probe"), result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProbeDomainTimeout() throws URISyntaxException {
|
||||
var result = fetcher.probeDomain(timeoutUrl);
|
||||
Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Timeout during domain probe"), result);
|
||||
}
|
||||
}
|
@@ -0,0 +1,381 @@
|
||||
package nu.marginalia.crawl.fetcher;
|
||||
|
||||
import com.github.tomakehurst.wiremock.WireMockServer;
|
||||
import com.github.tomakehurst.wiremock.client.WireMock;
|
||||
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
|
||||
import nu.marginalia.UserAgent;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.body.HttpFetchResult;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.netpreserve.jwarc.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.time.Instant;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
@Tag("slow")
|
||||
class HttpFetcherImplFetchTest {
|
||||
|
||||
private HttpFetcherImpl fetcher;
|
||||
private static WireMockServer wireMockServer;
|
||||
|
||||
private static String etag = "etag";
|
||||
private static String lastModified = "Wed, 21 Oct 2024 07:28:00 GMT";
|
||||
|
||||
private static EdgeUrl okUrl;
|
||||
private static EdgeUrl okUrlSetsCookie;
|
||||
private static EdgeUrl okRangeResponseUrl;
|
||||
private static EdgeUrl okUrlWith304;
|
||||
|
||||
private static EdgeUrl timeoutUrl;
|
||||
private static EdgeUrl redirectUrl;
|
||||
private static EdgeUrl badHttpStatusUrl;
|
||||
private static EdgeUrl keepAliveUrl;
|
||||
|
||||
@BeforeAll
|
||||
public static void setupAll() throws URISyntaxException {
|
||||
wireMockServer =
|
||||
new WireMockServer(WireMockConfiguration.wireMockConfig()
|
||||
.port(18089));
|
||||
|
||||
timeoutUrl = new EdgeUrl("http://localhost:18089/timeout.bin");
|
||||
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(timeoutUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withFixedDelay(15000)
|
||||
)); // 15 seconds delay to simulate timeout
|
||||
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(timeoutUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withFixedDelay(15000)
|
||||
.withBody("Hello World")
|
||||
)); // 15 seconds delay to simulate timeout
|
||||
|
||||
redirectUrl = new EdgeUrl("http://localhost:18089/redirect.bin");
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(redirectUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Location", "http://localhost:18089/test.html.bin")
|
||||
.withStatus(301)));
|
||||
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(redirectUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Location", "http://localhost:18089/test.html.bin")
|
||||
.withStatus(301)));
|
||||
|
||||
badHttpStatusUrl = new EdgeUrl("http://localhost:18089/badstatus");
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(badHttpStatusUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withStatus(500)));
|
||||
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(badHttpStatusUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withStatus(500)));
|
||||
|
||||
okUrl = new EdgeUrl("http://localhost:18089/ok.bin");
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(okUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withStatus(200)));
|
||||
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(okUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withStatus(200)
|
||||
.withBody("Hello World")));
|
||||
|
||||
okUrlSetsCookie = new EdgeUrl("http://localhost:18089/okSetCookie.bin");
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(okUrlSetsCookie.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withHeader("Set-Cookie", "test=1")
|
||||
.withStatus(200)));
|
||||
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(okUrlSetsCookie.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withHeader("Set-Cookie", "test=1")
|
||||
.withStatus(200)
|
||||
.withBody("Hello World")));
|
||||
|
||||
okUrlWith304 = new EdgeUrl("http://localhost:18089/ok304.bin");
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(okUrlWith304.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withHeader("ETag", etag)
|
||||
.withHeader("Last-Modified", lastModified)
|
||||
.withStatus(304)));
|
||||
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(okUrlWith304.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withHeader("ETag", etag)
|
||||
.withHeader("Last-Modified", lastModified)
|
||||
.withStatus(304)));
|
||||
|
||||
okRangeResponseUrl = new EdgeUrl("http://localhost:18089/okRangeResponse.bin");
|
||||
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(okRangeResponseUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Range", "bytes 0-100/200")
|
||||
.withBody("Hello World")
|
||||
.withStatus(206)));
|
||||
|
||||
keepAliveUrl = new EdgeUrl("http://localhost:18089/keepalive.bin");
|
||||
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(keepAliveUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withStatus(200)
|
||||
.withHeader("Keep-Alive", "max=4, timeout=30")
|
||||
.withBody("Hello")
|
||||
));
|
||||
|
||||
|
||||
wireMockServer.start();
|
||||
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void tearDownAll() {
|
||||
wireMockServer.stop();
|
||||
}
|
||||
|
||||
|
||||
WarcRecorder warcRecorder;
|
||||
Path warcFile;
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws IOException {
|
||||
fetcher = new HttpFetcherImpl(new UserAgent("test.marginalia.nu", "test.marginalia.nu"));
|
||||
warcFile = Files.createTempFile(getClass().getSimpleName(), ".warc");
|
||||
warcRecorder = new WarcRecorder(warcFile);
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
public void tearDown() throws IOException {
|
||||
var stats = fetcher.getPoolStats();
|
||||
assertEquals(0, stats.getLeased());
|
||||
assertEquals(0, stats.getPending());
|
||||
|
||||
System.out.println(stats);
|
||||
|
||||
fetcher.close();
|
||||
warcRecorder.close();
|
||||
Files.deleteIfExists(warcFile);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testFoo() {
|
||||
fetcher.fetchSitemapUrls("https://www.marginalia.nu/sitemap.xml", new CrawlDelayTimer(100));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOk_NoProbe() throws IOException {
|
||||
var result = fetcher.fetchContent(okUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
|
||||
Assertions.assertTrue(result.isOk());
|
||||
|
||||
List<WarcRecord> warcRecords = getWarcRecords();
|
||||
assertEquals(2, warcRecords.size());
|
||||
Assertions.assertInstanceOf(WarcRequest.class, warcRecords.get(0));
|
||||
Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));
|
||||
|
||||
WarcResponse response = (WarcResponse) warcRecords.get(1);
|
||||
assertEquals("0", response.http().headers().first("X-Has-Cookies").orElse("0"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOkSetsCookie() throws IOException {
|
||||
var cookies = new DomainCookies();
|
||||
var result = fetcher.fetchContent(okUrlSetsCookie, warcRecorder, cookies, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
|
||||
Assertions.assertTrue(result.isOk());
|
||||
|
||||
List<WarcRecord> warcRecords = getWarcRecords();
|
||||
assertEquals(2, warcRecords.size());
|
||||
Assertions.assertInstanceOf(WarcRequest.class, warcRecords.get(0));
|
||||
Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));
|
||||
|
||||
WarcResponse response = (WarcResponse) warcRecords.get(1);
|
||||
assertEquals("1", response.http().headers().first("X-Has-Cookies").orElse("0"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOk_FullProbe() {
|
||||
var result = fetcher.fetchContent(okUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
|
||||
Assertions.assertTrue(result.isOk());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOk304_NoProbe() {
|
||||
var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.DISABLED);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.Result304Raw.class, result);
|
||||
System.out.println(result);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOk304_FullProbe() {
|
||||
var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.FULL);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.Result304Raw.class, result);
|
||||
System.out.println(result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBadStatus_NoProbe() throws IOException {
|
||||
var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
|
||||
Assertions.assertFalse(result.isOk());
|
||||
|
||||
|
||||
List<WarcRecord> warcRecords = getWarcRecords();
|
||||
assertEquals(2, warcRecords.size());
|
||||
Assertions.assertInstanceOf(WarcRequest.class, warcRecords.get(0));
|
||||
Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBadStatus_FullProbe() {
|
||||
var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
|
||||
Assertions.assertFalse(result.isOk());
|
||||
|
||||
System.out.println(result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRedirect_NoProbe() throws URISyntaxException, IOException {
|
||||
var result = fetcher.fetchContent(redirectUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultRedirect.class, result);
|
||||
assertEquals(new EdgeUrl("http://localhost:18089/test.html.bin"), ((HttpFetchResult.ResultRedirect) result).url());
|
||||
|
||||
List<WarcRecord> warcRecords = getWarcRecords();
|
||||
assertEquals(2, warcRecords.size());
|
||||
Assertions.assertInstanceOf(WarcRequest.class, warcRecords.get(0));
|
||||
Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRedirect_FullProbe() throws URISyntaxException {
|
||||
var result = fetcher.fetchContent(redirectUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultRedirect.class, result);
|
||||
assertEquals(new EdgeUrl("http://localhost:18089/test.html.bin"), ((HttpFetchResult.ResultRedirect) result).url());
|
||||
|
||||
System.out.println(result);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testFetchTimeout_NoProbe() throws IOException, URISyntaxException {
|
||||
Instant requestStart = Instant.now();
|
||||
|
||||
var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultException.class, result);
|
||||
|
||||
Instant requestEnd = Instant.now();
|
||||
|
||||
System.out.println(result);
|
||||
|
||||
// Verify that we are actually timing out, and not blocking on the request until it finishes (which would be a bug),
|
||||
// the request will take 15 seconds to complete, so we should be able to timeout before that, something like 10 seconds and change;
|
||||
// but we'll verify that it is less than 15 seconds to make the test less fragile.
|
||||
|
||||
Assertions.assertTrue(requestEnd.isBefore(requestStart.plusSeconds(15)), "Request should have taken less than 15 seconds");
|
||||
|
||||
var records = getWarcRecords();
|
||||
Assertions.assertEquals(1, records.size());
|
||||
Assertions.assertInstanceOf(WarcXEntityRefused.class, records.getFirst());
|
||||
WarcXEntityRefused entity = (WarcXEntityRefused) records.getFirst();
|
||||
assertEquals(WarcXEntityRefused.documentProbeTimeout, entity.profile());
|
||||
assertEquals(timeoutUrl.asURI(), entity.targetURI());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRangeResponse() throws IOException {
|
||||
var result = fetcher.fetchContent(okRangeResponseUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
|
||||
Assertions.assertTrue(result.isOk());
|
||||
|
||||
List<WarcRecord> warcRecords = getWarcRecords();
|
||||
assertEquals(2, warcRecords.size());
|
||||
Assertions.assertInstanceOf(WarcRequest.class, warcRecords.get(0));
|
||||
Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));
|
||||
|
||||
var response = (WarcResponse) warcRecords.get(1);
|
||||
assertEquals("length", response.headers().first("WARC-Truncated").orElse(""));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFetchTimeout_Probe() throws IOException, URISyntaxException {
|
||||
Instant requestStart = Instant.now();
|
||||
var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
|
||||
Instant requestEnd = Instant.now();
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultException.class, result);
|
||||
|
||||
|
||||
// Verify that we are actually timing out, and not blocking on the request until it finishes (which would be a bug),
|
||||
// the request will take 15 seconds to complete, so we should be able to timeout before that, something like 10 seconds and change;
|
||||
// but we'll verify that it is less than 15 seconds to make the test less fragile.
|
||||
|
||||
Assertions.assertTrue(requestEnd.isBefore(requestStart.plusSeconds(15)), "Request should have taken less than 15 seconds");
|
||||
|
||||
var records = getWarcRecords();
|
||||
Assertions.assertEquals(1, records.size());
|
||||
Assertions.assertInstanceOf(WarcXEntityRefused.class, records.getFirst());
|
||||
WarcXEntityRefused entity = (WarcXEntityRefused) records.getFirst();
|
||||
assertEquals(WarcXEntityRefused.documentProbeTimeout, entity.profile());
|
||||
assertEquals(timeoutUrl.asURI(), entity.targetURI());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testKeepaliveUrl() {
|
||||
// mostly for smoke testing and debugger utility
|
||||
var result = fetcher.fetchContent(keepAliveUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
|
||||
Assertions.assertTrue(result.isOk());
|
||||
}
|
||||
|
||||
|
||||
private List<WarcRecord> getWarcRecords() throws IOException {
|
||||
List<WarcRecord> records = new ArrayList<>();
|
||||
|
||||
System.out.println(Files.readString(warcFile));
|
||||
|
||||
try (var reader = new WarcReader(warcFile)) {
|
||||
WarcXResponseReference.register(reader);
|
||||
WarcXEntityRefused.register(reader);
|
||||
|
||||
for (var record : reader) {
|
||||
// Load the body, we need to do this before we close the reader to have access to the content.
|
||||
if (record instanceof WarcRequest req) {
|
||||
req.http();
|
||||
} else if (record instanceof WarcResponse rsp) {
|
||||
rsp.http();
|
||||
}
|
||||
|
||||
records.add(record);
|
||||
}
|
||||
}
|
||||
|
||||
return records;
|
||||
}
|
||||
|
||||
|
||||
}
|
@@ -1,9 +1,12 @@
|
||||
package nu.marginalia.crawl.retreival;
|
||||
|
||||
import nu.marginalia.crawl.fetcher.Cookies;
|
||||
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
@@ -13,8 +16,6 @@ import org.netpreserve.jwarc.WarcResponse;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
@@ -30,8 +31,7 @@ class CrawlerWarcResynchronizerTest {
|
||||
HttpClient httpClient;
|
||||
@BeforeEach
|
||||
public void setUp() throws Exception {
|
||||
httpClient = HttpClient.newBuilder()
|
||||
.build();
|
||||
httpClient = HttpClients.createDefault();
|
||||
|
||||
fileName = Files.createTempFile("test", ".warc.gz");
|
||||
outputFile = Files.createTempFile("test", ".warc.gz");
|
||||
@@ -45,7 +45,7 @@ class CrawlerWarcResynchronizerTest {
|
||||
|
||||
@Test
|
||||
void run() throws IOException, URISyntaxException {
|
||||
try (var oldRecorder = new WarcRecorder(fileName, new Cookies())) {
|
||||
try (var oldRecorder = new WarcRecorder(fileName)) {
|
||||
fetchUrl(oldRecorder, "https://www.marginalia.nu/");
|
||||
fetchUrl(oldRecorder, "https://www.marginalia.nu/log/");
|
||||
fetchUrl(oldRecorder, "https://www.marginalia.nu/feed/");
|
||||
@@ -55,7 +55,7 @@ class CrawlerWarcResynchronizerTest {
|
||||
|
||||
var crawlFrontier = new DomainCrawlFrontier(new EdgeDomain("www.marginalia.nu"), List.of(), 100);
|
||||
|
||||
try (var newRecorder = new WarcRecorder(outputFile, new Cookies())) {
|
||||
try (var newRecorder = new WarcRecorder(outputFile)) {
|
||||
new CrawlerWarcResynchronizer(crawlFrontier, newRecorder).run(fileName);
|
||||
}
|
||||
|
||||
@@ -78,11 +78,10 @@ class CrawlerWarcResynchronizerTest {
|
||||
}
|
||||
|
||||
void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
|
||||
var req = HttpRequest.newBuilder()
|
||||
.uri(new java.net.URI(url))
|
||||
.header("User-agent", "test.marginalia.nu")
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.GET().build();
|
||||
recorder.fetch(httpClient, req);
|
||||
HttpGet request = new HttpGet(url);
|
||||
request.addHeader("User-agent", "test.marginalia.nu");
|
||||
request.addHeader("Accept-Encoding", "gzip");
|
||||
|
||||
recorder.fetch(httpClient, new DomainCookies(), request);
|
||||
}
|
||||
}
|
@@ -2,10 +2,10 @@ package nu.marginalia.crawl.retreival.fetcher;
|
||||
|
||||
import com.sun.net.httpserver.HttpServer;
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.fetcher.Cookies;
|
||||
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
@@ -32,7 +32,6 @@ class ContentTypeProberTest {
|
||||
static EdgeUrl timeoutEndpoint;
|
||||
|
||||
static Path warcFile;
|
||||
static WarcRecorder recorder;
|
||||
|
||||
@BeforeEach
|
||||
void setUp() throws IOException {
|
||||
@@ -80,21 +79,17 @@ class ContentTypeProberTest {
|
||||
htmlRedirEndpoint = EdgeUrl.parse("http://localhost:" + port + "/redir.gz").get();
|
||||
|
||||
fetcher = new HttpFetcherImpl("test");
|
||||
recorder = new WarcRecorder(warcFile, new Cookies());
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
void tearDown() throws IOException {
|
||||
server.stop(0);
|
||||
fetcher.close();
|
||||
recorder.close();
|
||||
|
||||
Files.deleteIfExists(warcFile);
|
||||
}
|
||||
|
||||
@Test
|
||||
void probeContentTypeOk() throws Exception {
|
||||
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlEndpoint, recorder, ContentTags.empty());
|
||||
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
|
||||
|
||||
System.out.println(result);
|
||||
|
||||
@@ -103,16 +98,16 @@ class ContentTypeProberTest {
|
||||
|
||||
@Test
|
||||
void probeContentTypeRedir() throws Exception {
|
||||
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlRedirEndpoint, recorder, ContentTags.empty());
|
||||
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlRedirEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
|
||||
|
||||
System.out.println(result);
|
||||
|
||||
assertEquals(result, new HttpFetcher.ContentTypeProbeResult.Ok(htmlEndpoint));
|
||||
assertEquals(result, new HttpFetcher.ContentTypeProbeResult.Redirect(htmlEndpoint));
|
||||
}
|
||||
|
||||
@Test
|
||||
void probeContentTypeBad() throws Exception {
|
||||
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(binaryEndpoint, recorder, ContentTags.empty());
|
||||
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(binaryEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
|
||||
|
||||
System.out.println(result);
|
||||
|
||||
@@ -121,7 +116,7 @@ class ContentTypeProberTest {
|
||||
|
||||
@Test
|
||||
void probeContentTypeTimeout() throws Exception {
|
||||
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(timeoutEndpoint, recorder, ContentTags.empty());
|
||||
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(timeoutEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
|
||||
|
||||
System.out.println(result);
|
||||
|
||||
|
@@ -0,0 +1,162 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher;
|
||||
|
||||
import com.sun.net.httpserver.HttpServer;
|
||||
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.netpreserve.jwarc.WarcReader;
|
||||
import org.netpreserve.jwarc.WarcRequest;
|
||||
import org.netpreserve.jwarc.WarcResponse;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.InetSocketAddress;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
@Tag("slow")
|
||||
class WarcRecorderFakeServerTest {
|
||||
static HttpServer server;
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpAll() throws IOException {
|
||||
server = HttpServer.create(new InetSocketAddress("127.0.0.1", 14510), 10);
|
||||
|
||||
// This endpoint will finish sending the response immediately
|
||||
server.createContext("/fast", exchange -> {
|
||||
exchange.getResponseHeaders().add("Content-Type", "text/html");
|
||||
exchange.sendResponseHeaders(200, "<html><body>hello</body></html>".length());
|
||||
|
||||
try (var os = exchange.getResponseBody()) {
|
||||
os.write("<html><body>hello</body></html>".getBytes());
|
||||
os.flush();
|
||||
}
|
||||
exchange.close();
|
||||
});
|
||||
|
||||
// This endpoint will take 10 seconds to finish sending the response,
|
||||
// which should trigger a timeout in the client
|
||||
server.createContext("/slow", exchange -> {
|
||||
exchange.getResponseHeaders().add("Content-Type", "text/html");
|
||||
exchange.sendResponseHeaders(200, "<html><body>hello</body></html>:D".length());
|
||||
|
||||
try (var os = exchange.getResponseBody()) {
|
||||
os.write("<html><body>hello</body></html>".getBytes());
|
||||
os.flush();
|
||||
try {
|
||||
TimeUnit.SECONDS.sleep(2);
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
os.write(":".getBytes());
|
||||
os.flush();
|
||||
try {
|
||||
TimeUnit.SECONDS.sleep(2);
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
os.write("D".getBytes());
|
||||
os.flush();
|
||||
}
|
||||
exchange.close();
|
||||
});
|
||||
|
||||
server.start();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void tearDownAll() {
|
||||
server.stop(0);
|
||||
}
|
||||
|
||||
Path fileNameWarc;
|
||||
Path fileNameParquet;
|
||||
WarcRecorder client;
|
||||
|
||||
HttpClient httpClient;
|
||||
@BeforeEach
|
||||
public void setUp() throws Exception {
|
||||
httpClient = HttpClients.createDefault();
|
||||
|
||||
fileNameWarc = Files.createTempFile("test", ".warc");
|
||||
fileNameParquet = Files.createTempFile("test", ".parquet");
|
||||
|
||||
client = new WarcRecorder(fileNameWarc);
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
public void tearDown() throws Exception {
|
||||
|
||||
client.close();
|
||||
Files.delete(fileNameWarc);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void fetchFast() throws Exception {
|
||||
HttpGet request = new HttpGet("http://localhost:14510/fast");
|
||||
request.addHeader("User-agent", "test.marginalia.nu");
|
||||
request.addHeader("Accept-Encoding", "gzip");
|
||||
client.fetch(httpClient, new DomainCookies(), request);
|
||||
|
||||
Map<String, String> sampleData = new HashMap<>();
|
||||
try (var warcReader = new WarcReader(fileNameWarc)) {
|
||||
warcReader.forEach(record -> {
|
||||
if (record instanceof WarcRequest req) {
|
||||
sampleData.put(record.type(), req.target());
|
||||
}
|
||||
if (record instanceof WarcResponse rsp) {
|
||||
sampleData.put(record.type(), rsp.target());
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
System.out.println(sampleData);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void fetchSlow() throws Exception {
|
||||
Instant start = Instant.now();
|
||||
|
||||
HttpGet request = new HttpGet("http://localhost:14510/slow");
|
||||
request.addHeader("User-agent", "test.marginalia.nu");
|
||||
request.addHeader("Accept-Encoding", "gzip");
|
||||
|
||||
client.fetch(httpClient,
|
||||
new DomainCookies(),
|
||||
request,
|
||||
Duration.ofSeconds(1)
|
||||
);
|
||||
Instant end = Instant.now();
|
||||
|
||||
Map<String, String> sampleData = new HashMap<>();
|
||||
try (var warcReader = new WarcReader(fileNameWarc)) {
|
||||
warcReader.forEach(record -> {
|
||||
if (record instanceof WarcRequest req) {
|
||||
sampleData.put(record.type(), req.target());
|
||||
}
|
||||
if (record instanceof WarcResponse rsp) {
|
||||
sampleData.put(record.type(), rsp.target());
|
||||
System.out.println(rsp.target());
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
System.out.println(
|
||||
Files.readString(fileNameWarc));
|
||||
System.out.println(sampleData);
|
||||
|
||||
// Timeout is set to 1 second, but the server will take 5 seconds to respond,
|
||||
// so we expect the request to take 1s and change before it times out.
|
||||
|
||||
Assertions.assertTrue(Duration.between(start, end).toMillis() < 3000);
|
||||
}
|
||||
|
||||
}
|
@@ -2,11 +2,14 @@ package nu.marginalia.crawl.retreival.fetcher;
|
||||
|
||||
import nu.marginalia.UserAgent;
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.fetcher.Cookies;
|
||||
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
|
||||
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
@@ -17,8 +20,6 @@ import org.netpreserve.jwarc.WarcXResponseReference;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
@@ -35,12 +36,12 @@ class WarcRecorderTest {
|
||||
HttpClient httpClient;
|
||||
@BeforeEach
|
||||
public void setUp() throws Exception {
|
||||
httpClient = HttpClient.newBuilder().build();
|
||||
httpClient = HttpClients.createDefault();
|
||||
|
||||
fileNameWarc = Files.createTempFile("test", ".warc");
|
||||
fileNameParquet = Files.createTempFile("test", ".parquet");
|
||||
|
||||
client = new WarcRecorder(fileNameWarc, new Cookies());
|
||||
client = new WarcRecorder(fileNameWarc);
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
@@ -51,13 +52,12 @@ class WarcRecorderTest {
|
||||
|
||||
@Test
|
||||
void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
|
||||
client.fetch(httpClient,
|
||||
HttpRequest.newBuilder()
|
||||
.uri(new java.net.URI("https://www.marginalia.nu/"))
|
||||
.header("User-agent", "test.marginalia.nu")
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.GET().build()
|
||||
);
|
||||
|
||||
HttpGet request = new HttpGet("https://www.marginalia.nu/");
|
||||
request.addHeader("User-agent", "test.marginalia.nu");
|
||||
request.addHeader("Accept-Encoding", "gzip");
|
||||
|
||||
client.fetch(httpClient, new DomainCookies(), request);
|
||||
|
||||
Map<String, String> sampleData = new HashMap<>();
|
||||
try (var warcReader = new WarcReader(fileNameWarc)) {
|
||||
@@ -78,8 +78,9 @@ class WarcRecorderTest {
|
||||
@Test
|
||||
public void flagAsSkipped() throws IOException, URISyntaxException {
|
||||
|
||||
try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
|
||||
try (var recorder = new WarcRecorder(fileNameWarc)) {
|
||||
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
|
||||
new DomainCookies(),
|
||||
"text/html",
|
||||
200,
|
||||
"<?doctype html><html><body>test</body></html>".getBytes(),
|
||||
@@ -102,8 +103,9 @@ class WarcRecorderTest {
|
||||
@Test
|
||||
public void flagAsSkippedNullBody() throws IOException, URISyntaxException {
|
||||
|
||||
try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
|
||||
try (var recorder = new WarcRecorder(fileNameWarc)) {
|
||||
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
|
||||
new DomainCookies(),
|
||||
"text/html",
|
||||
200,
|
||||
null,
|
||||
@@ -114,8 +116,9 @@ class WarcRecorderTest {
|
||||
|
||||
@Test
|
||||
public void testSaveImport() throws URISyntaxException, IOException {
|
||||
try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
|
||||
try (var recorder = new WarcRecorder(fileNameWarc)) {
|
||||
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
|
||||
new DomainCookies(),
|
||||
"text/html",
|
||||
200,
|
||||
"<?doctype html><html><body>test</body></html>".getBytes(),
|
||||
@@ -138,23 +141,23 @@ class WarcRecorderTest {
|
||||
|
||||
@Test
|
||||
public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
|
||||
client.fetch(httpClient, HttpRequest.newBuilder()
|
||||
.uri(new java.net.URI("https://www.marginalia.nu/"))
|
||||
.header("User-agent", "test.marginalia.nu")
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.GET().build());
|
||||
HttpGet request1 = new HttpGet("https://www.marginalia.nu/");
|
||||
request1.addHeader("User-agent", "test.marginalia.nu");
|
||||
request1.addHeader("Accept-Encoding", "gzip");
|
||||
|
||||
client.fetch(httpClient, HttpRequest.newBuilder()
|
||||
.uri(new java.net.URI("https://www.marginalia.nu/log/"))
|
||||
.header("User-agent", "test.marginalia.nu")
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.GET().build());
|
||||
client.fetch(httpClient, new DomainCookies(), request1);
|
||||
|
||||
client.fetch(httpClient, HttpRequest.newBuilder()
|
||||
.uri(new java.net.URI("https://www.marginalia.nu/sanic.png"))
|
||||
.header("User-agent", "test.marginalia.nu")
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.GET().build());
|
||||
HttpGet request2 = new HttpGet("https://www.marginalia.nu/log/");
|
||||
request2.addHeader("User-agent", "test.marginalia.nu");
|
||||
request2.addHeader("Accept-Encoding", "gzip");
|
||||
|
||||
client.fetch(httpClient, new DomainCookies(), request2);
|
||||
|
||||
HttpGet request3 = new HttpGet("https://www.marginalia.nu/sanic.png");
|
||||
request3.addHeader("User-agent", "test.marginalia.nu");
|
||||
request3.addHeader("Accept-Encoding", "gzip");
|
||||
|
||||
client.fetch(httpClient, new DomainCookies(), request3);
|
||||
|
||||
CrawledDocumentParquetRecordFileWriter.convertWarc(
|
||||
"www.marginalia.nu",
|
||||
|
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.crawling;
|
||||
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
@@ -31,7 +32,7 @@ class HttpFetcherTest {
|
||||
void fetchUTF8() throws Exception {
|
||||
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
|
||||
try (var recorder = new WarcRecorder()) {
|
||||
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
|
||||
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, new DomainCookies(), new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
|
||||
if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
|
||||
System.out.println(bodyOk.contentType());
|
||||
}
|
||||
@@ -49,7 +50,7 @@ class HttpFetcherTest {
|
||||
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
|
||||
|
||||
try (var recorder = new WarcRecorder()) {
|
||||
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
|
||||
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, new DomainCookies(), new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
|
||||
if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
|
||||
System.out.println(bodyOk.contentType());
|
||||
}
|
||||
|
@@ -15,6 +15,9 @@ import nu.marginalia.model.crawldata.CrawledDocument;
|
||||
import nu.marginalia.model.crawldata.CrawlerDocumentStatus;
|
||||
import nu.marginalia.model.crawldata.SerializableCrawlData;
|
||||
import nu.marginalia.test.CommonTestData;
|
||||
import org.apache.hc.client5.http.cookie.BasicCookieStore;
|
||||
import org.apache.hc.client5.http.cookie.CookieStore;
|
||||
import org.apache.hc.core5.http.Header;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
@@ -24,7 +27,6 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.http.HttpHeaders;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
@@ -120,7 +122,7 @@ public class CrawlerMockFetcherTest {
|
||||
public void setAllowAllContentTypes(boolean allowAllContentTypes) {}
|
||||
|
||||
@Override
|
||||
public Cookies getCookies() { return new Cookies();}
|
||||
public CookieStore getCookies() { return new BasicCookieStore();}
|
||||
|
||||
@Override
|
||||
public void clearCookies() {}
|
||||
@@ -132,13 +134,7 @@ public class CrawlerMockFetcherTest {
|
||||
}
|
||||
|
||||
@Override
|
||||
public ContentTypeProbeResult probeContentType(EdgeUrl url, WarcRecorder recorder, ContentTags tags) {
|
||||
logger.info("Probing {}", url);
|
||||
return new HttpFetcher.ContentTypeProbeResult.Ok(url);
|
||||
}
|
||||
|
||||
@Override
|
||||
public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags, ProbeType probeType) {
|
||||
public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, DomainCookies cookies, CrawlDelayTimer timer, ContentTags tags, ProbeType probeType) {
|
||||
logger.info("Fetching {}", url);
|
||||
if (mockData.containsKey(url)) {
|
||||
byte[] bodyBytes = mockData.get(url).documentBodyBytes;
|
||||
@@ -147,7 +143,7 @@ public class CrawlerMockFetcherTest {
|
||||
return new HttpFetchResult.ResultOk(
|
||||
url.asURI(),
|
||||
200,
|
||||
HttpHeaders.of(Map.of(), (k,v)->true),
|
||||
new Header[0],
|
||||
"127.0.0.1",
|
||||
bodyBytes,
|
||||
0,
|
||||
|
@@ -5,7 +5,6 @@ import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.crawl.CrawlerMain;
|
||||
import nu.marginalia.crawl.DomainStateDb;
|
||||
import nu.marginalia.crawl.fetcher.Cookies;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
@@ -16,7 +15,7 @@ import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||
import nu.marginalia.model.crawldata.CrawledDomain;
|
||||
import nu.marginalia.model.crawldata.SerializableCrawlData;
|
||||
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
|
||||
import nu.marginalia.slop.SlopCrawlDataRecord;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.netpreserve.jwarc.*;
|
||||
@@ -37,16 +36,16 @@ class CrawlerRetreiverTest {
|
||||
private HttpFetcher httpFetcher;
|
||||
|
||||
Path tempFileWarc1;
|
||||
Path tempFileParquet1;
|
||||
Path tempFileSlop1;
|
||||
Path tempFileWarc2;
|
||||
Path tempFileParquet2;
|
||||
Path tempFileSlop2;
|
||||
Path tempFileWarc3;
|
||||
Path tempFileDb;
|
||||
@BeforeEach
|
||||
public void setUp() throws IOException {
|
||||
httpFetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
|
||||
tempFileParquet1 = Files.createTempFile("crawling-process", ".parquet");
|
||||
tempFileParquet2 = Files.createTempFile("crawling-process", ".parquet");
|
||||
tempFileSlop1 = Files.createTempFile("crawling-process", ".slop.zip");
|
||||
tempFileSlop2 = Files.createTempFile("crawling-process", ".slop.zip");
|
||||
tempFileDb = Files.createTempFile("crawling-process", ".db");
|
||||
|
||||
}
|
||||
@@ -62,14 +61,14 @@ class CrawlerRetreiverTest {
|
||||
if (tempFileWarc1 != null) {
|
||||
Files.deleteIfExists(tempFileWarc1);
|
||||
}
|
||||
if (tempFileParquet1 != null) {
|
||||
Files.deleteIfExists(tempFileParquet1);
|
||||
if (tempFileSlop1 != null) {
|
||||
Files.deleteIfExists(tempFileSlop1);
|
||||
}
|
||||
if (tempFileWarc2 != null) {
|
||||
Files.deleteIfExists(tempFileWarc2);
|
||||
}
|
||||
if (tempFileParquet2 != null) {
|
||||
Files.deleteIfExists(tempFileParquet2);
|
||||
if (tempFileSlop2 != null) {
|
||||
Files.deleteIfExists(tempFileSlop2);
|
||||
}
|
||||
if (tempFileWarc3 != null) {
|
||||
Files.deleteIfExists(tempFileWarc3);
|
||||
@@ -180,7 +179,7 @@ class CrawlerRetreiverTest {
|
||||
new EdgeDomain("www.marginalia.nu"),
|
||||
List.of(), 100);
|
||||
var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
|
||||
new WarcRecorder(tempFileWarc2, new Cookies())
|
||||
new WarcRecorder(tempFileWarc2)
|
||||
);
|
||||
|
||||
// truncate the size of the file to simulate a crash
|
||||
@@ -224,9 +223,9 @@ class CrawlerRetreiverTest {
|
||||
|
||||
doCrawl(tempFileWarc1, specs);
|
||||
|
||||
convertToParquet(tempFileWarc1, tempFileParquet1);
|
||||
convertToSlop(tempFileWarc1, tempFileSlop1);
|
||||
|
||||
try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
|
||||
try (var stream = SerializableCrawlDataStream.openDataStream(tempFileSlop1)) {
|
||||
while (stream.hasNext()) {
|
||||
if (stream.next() instanceof CrawledDocument doc) {
|
||||
data.add(doc);
|
||||
@@ -277,9 +276,9 @@ class CrawlerRetreiverTest {
|
||||
assertFalse(frontier.isVisited(new EdgeUrl("https://www.marginalia.nu/log/06-optimization/")));
|
||||
assertTrue(frontier.isKnown(new EdgeUrl("https://www.marginalia.nu/log/06-optimization/")));
|
||||
|
||||
convertToParquet(tempFileWarc1, tempFileParquet1);
|
||||
convertToSlop(tempFileWarc1, tempFileSlop1);
|
||||
|
||||
try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
|
||||
try (var stream = SerializableCrawlDataStream.openDataStream(tempFileSlop1)) {
|
||||
while (stream.hasNext()) {
|
||||
if (stream.next() instanceof CrawledDocument doc) {
|
||||
data.add(doc);
|
||||
@@ -293,7 +292,7 @@ class CrawlerRetreiverTest {
|
||||
// redirects to https://www.marginalia.nu/log/06-optimization.gmi/ (note the trailing slash)
|
||||
//
|
||||
// Ensure that the redirect is followed, and that the trailing slash is added
|
||||
// to the url as reported in the parquet file.
|
||||
// to the url as reported in the Slop file.
|
||||
|
||||
var fetchedUrls =
|
||||
data.stream()
|
||||
@@ -326,9 +325,9 @@ class CrawlerRetreiverTest {
|
||||
tempFileWarc1 = Files.createTempFile("crawling-process", ".warc");
|
||||
|
||||
doCrawl(tempFileWarc1, specs);
|
||||
convertToParquet(tempFileWarc1, tempFileParquet1);
|
||||
convertToSlop(tempFileWarc1, tempFileSlop1);
|
||||
|
||||
try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
|
||||
try (var stream = SerializableCrawlDataStream.openDataStream(tempFileSlop1)) {
|
||||
while (stream.hasNext()) {
|
||||
if (stream.next() instanceof CrawledDocument doc) {
|
||||
data.add(doc);
|
||||
@@ -373,11 +372,11 @@ class CrawlerRetreiverTest {
|
||||
tempFileWarc2 = Files.createTempFile("crawling-process", ".warc.gz");
|
||||
|
||||
doCrawl(tempFileWarc1, specs);
|
||||
convertToParquet(tempFileWarc1, tempFileParquet1);
|
||||
convertToSlop(tempFileWarc1, tempFileSlop1);
|
||||
doCrawlWithReferenceStream(specs,
|
||||
new CrawlDataReference(tempFileParquet1)
|
||||
new CrawlDataReference(tempFileSlop1)
|
||||
);
|
||||
convertToParquet(tempFileWarc2, tempFileParquet2);
|
||||
convertToSlop(tempFileWarc2, tempFileSlop2);
|
||||
|
||||
try (var reader = new WarcReader(tempFileWarc2)) {
|
||||
WarcXResponseReference.register(reader);
|
||||
@@ -396,7 +395,7 @@ class CrawlerRetreiverTest {
|
||||
});
|
||||
}
|
||||
|
||||
try (var ds = SerializableCrawlDataStream.openDataStream(tempFileParquet2)) {
|
||||
try (var ds = SerializableCrawlDataStream.openDataStream(tempFileSlop2)) {
|
||||
while (ds.hasNext()) {
|
||||
var doc = ds.next();
|
||||
if (doc instanceof CrawledDomain dr) {
|
||||
@@ -411,9 +410,9 @@ class CrawlerRetreiverTest {
|
||||
}
|
||||
}
|
||||
|
||||
private void convertToParquet(Path tempFileWarc2, Path tempFileParquet2) {
|
||||
CrawledDocumentParquetRecordFileWriter.convertWarc("www.marginalia.nu",
|
||||
new UserAgent("test", "test"), tempFileWarc2, tempFileParquet2);
|
||||
private void convertToSlop(Path tempFileWarc2, Path tempFileSlop2) throws IOException {
|
||||
SlopCrawlDataRecord.convertWarc("www.marginalia.nu",
|
||||
new UserAgent("test", "test"), tempFileWarc2, tempFileSlop2);
|
||||
}
|
||||
|
||||
|
||||
@@ -436,9 +435,9 @@ class CrawlerRetreiverTest {
|
||||
|
||||
doCrawl(tempFileWarc1, specs);
|
||||
|
||||
convertToParquet(tempFileWarc1, tempFileParquet1);
|
||||
convertToSlop(tempFileWarc1, tempFileSlop1);
|
||||
|
||||
try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
|
||||
try (var stream = SerializableCrawlDataStream.openDataStream(tempFileSlop1)) {
|
||||
while (stream.hasNext()) {
|
||||
var doc = stream.next();
|
||||
data.computeIfAbsent(doc.getClass(), c -> new ArrayList<>()).add(doc);
|
||||
@@ -449,14 +448,14 @@ class CrawlerRetreiverTest {
|
||||
|
||||
System.out.println("---");
|
||||
|
||||
doCrawlWithReferenceStream(specs, new CrawlDataReference(tempFileParquet1));
|
||||
doCrawlWithReferenceStream(specs, new CrawlDataReference(tempFileSlop1));
|
||||
|
||||
var revisitCrawlFrontier = new DomainCrawlFrontier(
|
||||
new EdgeDomain("www.marginalia.nu"),
|
||||
List.of(), 100);
|
||||
|
||||
var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
|
||||
new WarcRecorder(tempFileWarc3, new Cookies())
|
||||
new WarcRecorder(tempFileWarc3)
|
||||
);
|
||||
|
||||
// truncate the size of the file to simulate a crash
|
||||
@@ -465,7 +464,7 @@ class CrawlerRetreiverTest {
|
||||
resync.run(tempFileWarc2);
|
||||
|
||||
assertTrue(revisitCrawlFrontier.addKnown(new EdgeUrl("https://www.marginalia.nu/")));
|
||||
convertToParquet(tempFileWarc3, tempFileParquet2);
|
||||
convertToSlop(tempFileWarc3, tempFileSlop2);
|
||||
|
||||
|
||||
try (var reader = new WarcReader(tempFileWarc3)) {
|
||||
@@ -485,7 +484,7 @@ class CrawlerRetreiverTest {
|
||||
});
|
||||
}
|
||||
|
||||
try (var ds = SerializableCrawlDataStream.openDataStream(tempFileParquet2)) {
|
||||
try (var ds = SerializableCrawlDataStream.openDataStream(tempFileSlop2)) {
|
||||
while (ds.hasNext()) {
|
||||
var doc = ds.next();
|
||||
if (doc instanceof CrawledDomain dr) {
|
||||
@@ -507,7 +506,7 @@ class CrawlerRetreiverTest {
|
||||
}
|
||||
|
||||
private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, CrawlDataReference reference) {
|
||||
try (var recorder = new WarcRecorder(tempFileWarc2, new Cookies());
|
||||
try (var recorder = new WarcRecorder(tempFileWarc2);
|
||||
var db = new DomainStateDb(tempFileDb)
|
||||
) {
|
||||
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(), reference);
|
||||
@@ -519,7 +518,7 @@ class CrawlerRetreiverTest {
|
||||
|
||||
@NotNull
|
||||
private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlerMain.CrawlSpecRecord specs) {
|
||||
try (var recorder = new WarcRecorder(tempFileWarc1, new Cookies());
|
||||
try (var recorder = new WarcRecorder(tempFileWarc1);
|
||||
var db = new DomainStateDb(tempFileDb)
|
||||
) {
|
||||
var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder);
|
||||
|
@@ -44,6 +44,7 @@ dependencies {
|
||||
implementation libs.guice
|
||||
implementation libs.fastutil
|
||||
implementation libs.trove
|
||||
implementation libs.bundles.httpcomponents
|
||||
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation project(':code:libraries:test-helpers')
|
||||
|
@@ -10,7 +10,7 @@ import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
|
||||
import nu.marginalia.converting.processor.DomainProcessor;
|
||||
import nu.marginalia.converting.writer.ConverterBatchWriter;
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.fetcher.Cookies;
|
||||
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.functions.searchquery.QueryFactory;
|
||||
@@ -121,11 +121,12 @@ public class IntegrationTest {
|
||||
public void run() throws Exception {
|
||||
|
||||
/** CREATE WARC */
|
||||
try (WarcRecorder warcRecorder = new WarcRecorder(warcData, new Cookies())) {
|
||||
try (WarcRecorder warcRecorder = new WarcRecorder(warcData)) {
|
||||
warcRecorder.writeWarcinfoHeader("127.0.0.1", new EdgeDomain("www.example.com"),
|
||||
new HttpFetcherImpl.DomainProbeResult.Ok(new EdgeUrl("https://www.example.com/")));
|
||||
|
||||
warcRecorder.writeReferenceCopy(new EdgeUrl("https://www.example.com/"),
|
||||
new DomainCookies(),
|
||||
"text/html", 200,
|
||||
"""
|
||||
<html>
|
||||
|
@@ -179,8 +179,9 @@ dependencyResolutionManagement {
|
||||
|
||||
library('jwarc', 'org.netpreserve', 'jwarc').version('0.28.5')
|
||||
|
||||
library('httpcomponents.core','org.apache.httpcomponents','httpcore').version('4.4.15')
|
||||
library('httpcomponents.client','org.apache.httpcomponents','httpclient').version('4.5.13')
|
||||
|
||||
library('httpcore', 'org.apache.httpcomponents.core5','httpcore5').version('5.3.4')
|
||||
library('httpclient', 'org.apache.httpcomponents.client5','httpclient5').version('5.4.3')
|
||||
library('commons.net', 'commons-net','commons-net').version('3.9.0')
|
||||
library('commons.lang3', 'org.apache.commons','commons-lang3').version('3.12.0')
|
||||
library('commons.compress','org.apache.commons','commons-compress').version('1.25.0')
|
||||
@@ -255,7 +256,7 @@ dependencyResolutionManagement {
|
||||
bundle('grpc', ['protobuf', 'grpc-stub', 'grpc-protobuf', 'grpc-netty'])
|
||||
bundle('protobuf', ['protobuf', 'javax.annotation'])
|
||||
bundle('gson', ['gson', 'gson-type-adapter'])
|
||||
bundle('httpcomponents', ['httpcomponents.core', 'httpcomponents.client'])
|
||||
bundle('httpcomponents', ['httpcore', 'httpclient'])
|
||||
bundle('parquet', ['parquet-column', 'parquet-hadoop'])
|
||||
bundle('junit', ['junit.jupiter', 'junit.jupiter.engine'])
|
||||
bundle('flyway', ['flyway.core', 'flyway.mysql'])
|
||||
|
Reference in New Issue
Block a user