Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-10-06 07:32:38 +02:00

Compare commits: deploy-011...deploy-012 (25 commits)
Commits:

    58e6f141ce
    500f63e921
    6dfbedda1e
    9715ddb105
    1fc6313a77
    b1249d5b8a
    ef95d59b07
    acdd8664f5
    6b12eac58a
    bb3f1f395a
    b661beef41
    9888c47f19
    dcef7e955b
    b3973a1dd7
    8bd05d6d90
    59df8e356e
    7161162a35
    d7c4c5141f
    88e9b8fb05
    b6265cee11
    c91af247e9
    7a31227de1
    4f477604c5
    2970f4395b
    d1ec909b36
@@ -35,21 +35,8 @@ public class RateLimiter {
     }
 
-    public static RateLimiter forExpensiveRequest() {
-        return new RateLimiter(5, 10);
-    }
-
     public static RateLimiter custom(int perMinute) {
-        return new RateLimiter(perMinute, 60);
+        return new RateLimiter(4 * perMinute, perMinute);
     }
 
-    public static RateLimiter forSpamBots() {
-        return new RateLimiter(120, 3600);
-    }
-
-    public static RateLimiter forLogin() {
-        return new RateLimiter(3, 15);
-    }
-
     private void cleanIdleBuckets() {

@@ -62,7 +49,7 @@ public class RateLimiter {
     }
 
     private Bucket createBucket() {
-        var refill = Refill.greedy(1, Duration.ofSeconds(refillRate));
+        var refill = Refill.greedy(refillRate, Duration.ofSeconds(60));
         var bw = Bandwidth.classic(capacity, refill);
         return Bucket.builder().addLimit(bw).build();
     }
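The change above inverts the Bucket4j refill semantics: the old code trickled in a single token every `refillRate` seconds, while the new code restores `refillRate` tokens per minute, and `custom()` now sizes the burst capacity at four times the per-minute rate. A minimal sketch of the two behaviours, assuming only a plain Bucket4j dependency; the class and method names are illustrative, not taken from the repository:

import io.github.bucket4j.Bandwidth;
import io.github.bucket4j.Bucket;
import io.github.bucket4j.Refill;

import java.time.Duration;

class RefillSemanticsSketch {
    // Old behaviour: one token per `refillRate` seconds, i.e. refillRate acts as a delay.
    static Bucket oldStyle(int capacity, int refillRate) {
        Refill refill = Refill.greedy(1, Duration.ofSeconds(refillRate));
        return Bucket.builder().addLimit(Bandwidth.classic(capacity, refill)).build();
    }

    // New behaviour: `refillRate` tokens per minute, i.e. refillRate acts as a rate.
    static Bucket newStyle(int capacity, int refillRate) {
        Refill refill = Refill.greedy(refillRate, Duration.ofSeconds(60));
        return Bucket.builder().addLimit(Bandwidth.classic(capacity, refill)).build();
    }

    public static void main(String[] args) {
        // Mirrors custom(30) after the change: a burst of 120 requests, refilled at 30 per minute.
        Bucket bucket = newStyle(4 * 30, 30);
        System.out.println("Tokens available at start: " + bucket.getAvailableTokens());
    }
}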
@@ -5,6 +5,7 @@
       <Filters>
         <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
         <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+        <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
       </Filters>
     </Console>
     <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"

@@ -13,9 +14,20 @@
       <Filters>
         <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
         <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+        <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
       </Filters>
       <SizeBasedTriggeringPolicy size="10MB" />
     </RollingFile>
+    <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
+                 ignoreExceptions="false">
+      <PatternLayout>
+        <Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
+      </PatternLayout>
+      <SizeBasedTriggeringPolicy size="100MB" />
+      <Filters>
+        <MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
+      </Filters>
+    </RollingFile>
   </Appenders>
   <Loggers>
     <Logger name="org.apache.zookeeper" level="WARN" />
@@ -5,6 +5,7 @@
       <Filters>
         <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
        <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+        <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
       </Filters>
     </Console>
     <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"

@@ -17,6 +18,17 @@
       <MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
       <MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
       <MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
+      <MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
+      </Filters>
+    </RollingFile>
+    <RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
+                 ignoreExceptions="false">
+      <PatternLayout>
+        <Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
+      </PatternLayout>
+      <SizeBasedTriggeringPolicy size="100MB" />
+      <Filters>
+        <MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
       </Filters>
     </RollingFile>
   </Appenders>
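Both configuration files now route crawler audit events by log marker: the console and the ordinary service log appenders DENY anything marked CRAWLER, while the new crawler-audit RollingFile ALLOWs only those events and DENYs the rest. A minimal sketch of how such an event is emitted, assuming SLF4J over a Log4j2 backend as used elsewhere in the project; the class name is illustrative:

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;

class CrawlerAuditLoggingSketch {
    private static final Logger logger = LoggerFactory.getLogger(CrawlerAuditLoggingSketch.class);
    private static final Marker CRAWLER = MarkerFactory.getMarker("CRAWLER");

    void recordFetch(String url, int statusCode) {
        // Marked events match the CRAWLER MarkerFilter: only the audit appender keeps them.
        logger.info(CRAWLER, "Fetch result OK {} for {}", statusCode, url);

        // Unmarked events pass the filters as NEUTRAL and go to the normal appenders.
        logger.info("Fetched {} with status {}", url, statusCode);
    }
}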
@@ -33,6 +33,7 @@ import java.sql.SQLException;
 import java.time.*;
 import java.time.format.DateTimeFormatter;
 import java.util.*;
+import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;

@@ -71,7 +72,7 @@ public class FeedFetcherService {
     public enum UpdateMode {
         CLEAN,
         REFRESH
-    };
+    }
 
     public void updateFeeds(UpdateMode updateMode) throws IOException {
         if (updating) // Prevent concurrent updates

@@ -87,6 +88,7 @@ public class FeedFetcherService {
                      .followRedirects(HttpClient.Redirect.NORMAL)
                      .version(HttpClient.Version.HTTP_2)
                      .build();
+             ExecutorService fetchExecutor = Executors.newCachedThreadPool();
              FeedJournal feedJournal = FeedJournal.create();
              var heartbeat = serviceHeartbeat.createServiceAdHocTaskHeartbeat("Update Rss Feeds")
         ) {

@@ -131,7 +133,7 @@ public class FeedFetcherService {
 
                     FetchResult feedData;
                     try (DomainLocks.DomainLock domainLock = domainLocks.lockDomain(new EdgeDomain(feed.domain()))) {
-                        feedData = fetchFeedData(feed, client, ifModifiedSinceDate, ifNoneMatchTag);
+                        feedData = fetchFeedData(feed, client, fetchExecutor, ifModifiedSinceDate, ifNoneMatchTag);
                     } catch (Exception ex) {
                         feedData = new FetchResult.TransientError();
                     }

@@ -211,6 +213,7 @@ public class FeedFetcherService {
 
     private FetchResult fetchFeedData(FeedDefinition feed,
                                       HttpClient client,
+                                      ExecutorService executorService,
                                       @Nullable String ifModifiedSinceDate,
                                       @Nullable String ifNoneMatchTag)
     {

@@ -237,7 +240,14 @@ public class FeedFetcherService {
             HttpRequest getRequest = requestBuilder.build();
 
             for (int i = 0; i < 3; i++) {
-                HttpResponse<byte[]> rs = client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray());
+
+                /* Note we need to use an executor to time-limit the send() method in HttpClient, as
+                 * its support for timeouts only applies to the time until response starts to be received,
+                 * and does not catch the case when the server starts to send data but then hangs.
+                 */
+                HttpResponse<byte[]> rs = executorService.submit(
+                        () -> client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray()))
+                        .get(15, TimeUnit.SECONDS);
 
                 if (rs.statusCode() == 429) { // Too Many Requests
                     int retryAfter = Integer.parseInt(rs.headers().firstValue("Retry-After").orElse("2"));
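The comment in the hunk above explains the executor wrapper: java.net.http.HttpClient request timeouts only bound the time until the response begins to arrive, not a server that starts sending a body and then stalls, so the whole exchange is bounded by Future.get() instead. A self-contained sketch of the same pattern, with illustrative URL and timeout values:

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

class TimeLimitedFetchSketch {
    public static void main(String[] args) throws Exception {
        HttpClient client = HttpClient.newHttpClient();
        ExecutorService fetchExecutor = Executors.newCachedThreadPool();

        HttpRequest request = HttpRequest.newBuilder(URI.create("https://example.com/feed.xml"))
                .GET()
                .build();

        try {
            // Future.get() caps the entire exchange, including a body that stalls mid-transfer,
            // which the request-level timeout of HttpClient does not cover.
            HttpResponse<byte[]> rs = fetchExecutor.submit(
                    () -> client.send(request, HttpResponse.BodyHandlers.ofByteArray()))
                    .get(15, TimeUnit.SECONDS);
            System.out.println("Status " + rs.statusCode() + ", " + rs.body().length + " bytes");
        } catch (TimeoutException e) {
            System.err.println("Fetch exceeded 15 seconds and was abandoned");
        } finally {
            fetchExecutor.shutdownNow();
        }
    }
}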
@@ -87,6 +87,8 @@ dependencies {
     implementation libs.commons.compress
     implementation libs.sqlite
 
+    implementation libs.bundles.httpcomponents
+
     testImplementation libs.bundles.slf4j.test
     testImplementation libs.bundles.junit
     testImplementation libs.mockito
@@ -8,7 +8,6 @@ import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.DomainStateDb;
-import nu.marginalia.crawl.fetcher.Cookies;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;

@@ -21,6 +20,7 @@ import nu.marginalia.model.crawldata.CrawledDocument;
 import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.model.crawldata.SerializableCrawlData;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
+import org.apache.hc.client5.http.cookie.BasicCookieStore;
 import org.junit.jupiter.api.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -247,7 +247,7 @@ public class CrawlingThenConvertingIntegrationTest {
     private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws Exception {
         List<SerializableCrawlData> data = new ArrayList<>();
 
-        try (var recorder = new WarcRecorder(fileName, new Cookies());
+        try (var recorder = new WarcRecorder(fileName, new BasicCookieStore());
             var db = new DomainStateDb(dbTempFile))
         {
            new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, db, recorder).crawlDomain();
@@ -60,10 +60,14 @@ dependencies {
     implementation libs.fastutil
 
     implementation libs.bundles.mariadb
+    implementation libs.bundles.httpcomponents
 
     testImplementation libs.bundles.slf4j.test
     testImplementation libs.bundles.junit
     testImplementation libs.mockito
+    testImplementation libs.wiremock
+
+
 
     testImplementation project(':code:processes:test-data')
 }
@@ -501,7 +501,7 @@ public class CrawlerMain extends ProcessMainClass {
                     return new CrawlDataReference(slopPath);
                 }
 
-            } catch (IOException e) {
+            } catch (Exception e) {
                 logger.debug("Failed to read previous crawl data for {}", specification.domain());
             }
 
@@ -1,6 +1,6 @@
 package nu.marginalia.crawl.fetcher;
 
-import java.net.http.HttpRequest;
+import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 
 /** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
 public record ContentTags(String etag, String lastMod) {

@@ -17,14 +17,14 @@ public record ContentTags(String etag, String lastMod) {
     }
 
     /** Paints the tags onto the request builder. */
-    public void paint(HttpRequest.Builder getBuilder) {
+    public void paint(ClassicRequestBuilder getBuilder) {
 
         if (etag != null) {
-            getBuilder.header("If-None-Match", etag);
+            getBuilder.addHeader("If-None-Match", etag);
         }
 
         if (lastMod != null) {
-            getBuilder.header("If-Modified-Since", lastMod);
+            getBuilder.addHeader("If-Modified-Since", lastMod);
         }
     }
 }
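With this change the validators are painted onto an Apache HttpClient 5 ClassicRequestBuilder instead of the JDK's HttpRequest.Builder. A small usage sketch, assuming the ContentTags record above is on the classpath; the ETag and date values are illustrative:

import nu.marginalia.crawl.fetcher.ContentTags;
import org.apache.hc.core5.http.ClassicHttpRequest;
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;

class ContentTagsUsageSketch {
    public static void main(String[] args) {
        // In the crawler these values come from the previous crawl; here they are made up.
        ContentTags tags = new ContentTags("\"abc123\"", "Wed, 01 Jan 2025 00:00:00 GMT");

        ClassicRequestBuilder builder = ClassicRequestBuilder.get("https://example.com/page.html");
        tags.paint(builder); // adds If-None-Match and If-Modified-Since when the values are non-null

        ClassicHttpRequest request = builder.build();
        System.out.println(request.getFirstHeader("If-None-Match"));
    }
}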
@@ -8,6 +8,7 @@ import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
 import nu.marginalia.model.crawldata.CrawlerDomainStatus;
+import org.apache.hc.client5.http.cookie.CookieStore;
 
 import java.util.List;
 

@@ -15,20 +16,16 @@ import java.util.List;
 public interface HttpFetcher extends AutoCloseable {
     void setAllowAllContentTypes(boolean allowAllContentTypes);
 
-    Cookies getCookies();
+    CookieStore getCookies();
     void clearCookies();
 
     DomainProbeResult probeDomain(EdgeUrl url);
 
-    ContentTypeProbeResult probeContentType(
-            EdgeUrl url,
-            WarcRecorder recorder,
-            ContentTags tags) throws HttpFetcherImpl.RateLimitException;
-
     HttpFetchResult fetchContent(EdgeUrl url,
                                  WarcRecorder recorder,
+                                 CrawlDelayTimer timer,
                                  ContentTags tags,
-                                 ProbeType probeType) throws Exception;
+                                 ProbeType probeType);
 
     List<EdgeUrl> fetchSitemapUrls(String rootSitemapUrl, CrawlDelayTimer delayTimer);
 

@@ -46,6 +43,7 @@ public interface HttpFetcher extends AutoCloseable {
 
     /** This domain redirects to another domain */
     record Redirect(EdgeDomain domain) implements DomainProbeResult {}
+    record RedirectSameDomain_Internal(EdgeUrl domain) implements DomainProbeResult {}
 
     /** If the retrieval of the probed url was successful, return the url as it was fetched
      * (which may be different from the url we probed, if we attempted another URL schema).

@@ -56,7 +54,10 @@ public interface HttpFetcher extends AutoCloseable {
     }
 
     sealed interface ContentTypeProbeResult {
+        record NoOp() implements ContentTypeProbeResult {}
         record Ok(EdgeUrl resolvedUrl) implements ContentTypeProbeResult { }
+        record HttpError(int statusCode, String message) implements ContentTypeProbeResult { }
+        record Redirect(EdgeUrl location) implements ContentTypeProbeResult { }
         record BadContentType(String contentType, int statusCode) implements ContentTypeProbeResult { }
         record Timeout(java.lang.Exception ex) implements ContentTypeProbeResult { }
         record Exception(java.lang.Exception ex) implements ContentTypeProbeResult { }
@@ -5,79 +5,146 @@ import com.google.inject.Singleton;
 import crawlercommons.robots.SimpleRobotRules;
 import crawlercommons.robots.SimpleRobotRulesParser;
 import nu.marginalia.UserAgent;
-import nu.marginalia.crawl.fetcher.socket.NoSecuritySSL;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.CrawlDelayTimer;
+import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.ContentTypeLogic;
 import nu.marginalia.model.body.DocumentBodyExtractor;
 import nu.marginalia.model.body.HttpFetchResult;
 import nu.marginalia.model.crawldata.CrawlerDomainStatus;
+import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
+import org.apache.hc.client5.http.HttpRequestRetryStrategy;
+import org.apache.hc.client5.http.classic.HttpClient;
+import org.apache.hc.client5.http.config.ConnectionConfig;
+import org.apache.hc.client5.http.config.RequestConfig;
+import org.apache.hc.client5.http.cookie.BasicCookieStore;
+import org.apache.hc.client5.http.cookie.CookieStore;
+import org.apache.hc.client5.http.cookie.StandardCookieSpec;
+import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
+import org.apache.hc.client5.http.impl.classic.HttpClients;
+import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager;
+import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder;
+import org.apache.hc.client5.http.ssl.DefaultClientTlsStrategy;
+import org.apache.hc.core5.http.*;
+import org.apache.hc.core5.http.io.HttpClientResponseHandler;
+import org.apache.hc.core5.http.io.SocketConfig;
+import org.apache.hc.core5.http.io.entity.EntityUtils;
+import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
+import org.apache.hc.core5.http.message.MessageSupport;
+import org.apache.hc.core5.http.protocol.HttpContext;
+import org.apache.hc.core5.util.TimeValue;
+import org.apache.hc.core5.util.Timeout;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.parser.Parser;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.slf4j.Marker;
+import org.slf4j.MarkerFactory;
 
+import javax.net.ssl.SSLContext;
 import java.io.IOException;
-import java.io.InputStream;
+import java.net.SocketTimeoutException;
 import java.net.URISyntaxException;
-import java.net.http.HttpClient;
-import java.net.http.HttpRequest;
-import java.net.http.HttpResponse;
-import java.net.http.HttpTimeoutException;
+import java.security.NoSuchAlgorithmException;
 import java.time.Duration;
 import java.util.*;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
 import java.util.concurrent.Semaphore;
-import java.util.zip.GZIPInputStream;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
 
 
 @Singleton
-public class HttpFetcherImpl implements HttpFetcher {
+public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
 
     private final Logger logger = LoggerFactory.getLogger(getClass());
     private final String userAgentString;
     private final String userAgentIdentifier;
-    private final Cookies cookies = new Cookies();
+    private final CookieStore cookies = new BasicCookieStore();
 
     private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
     private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
+    private final Marker crawlerAuditMarker = MarkerFactory.getMarker("CRAWLER");
 
-    private final Duration requestTimeout = Duration.ofSeconds(10);
-    private final Duration probeTimeout = Duration.ofSeconds(30);
+    private final LinkParser linkParser = new LinkParser();
 
     @Override
     public void setAllowAllContentTypes(boolean allowAllContentTypes) {
         contentTypeLogic.setAllowAllContentTypes(allowAllContentTypes);
     }
 
-    private final HttpClient client;
+    private final CloseableHttpClient client;
 
-    private HttpClient createClient() {
-        final ExecutorService executorService;
-
-        if (Boolean.getBoolean("crawler.httpclient.useVirtualThreads")) {
-            executorService = Executors.newVirtualThreadPerTaskExecutor();
-        }
-        else {
-            executorService = Executors.newCachedThreadPool();
-        }
-
-        return HttpClient.newBuilder()
-                .sslContext(NoSecuritySSL.buildSslContext())
-                .cookieHandler(cookies)
-                .followRedirects(HttpClient.Redirect.NORMAL)
-                .version(HttpClient.Version.HTTP_1_1)
-                .connectTimeout(Duration.ofSeconds(8))
-                .executor(executorService)
+    private CloseableHttpClient createClient() throws NoSuchAlgorithmException {
+        final ConnectionConfig connectionConfig = ConnectionConfig.custom()
+                .setSocketTimeout(10, TimeUnit.SECONDS)
+                .setConnectTimeout(30, TimeUnit.SECONDS)
+                .build();
+
+        final PoolingHttpClientConnectionManager connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
+                .setMaxConnPerRoute(2)
+                .setMaxConnTotal(5000)
+                .setDefaultConnectionConfig(connectionConfig)
+                .setTlsSocketStrategy(new DefaultClientTlsStrategy(SSLContext.getDefault()))
+                .build();
+
+        connectionManager.setDefaultSocketConfig(SocketConfig.custom()
+                .setSoLinger(TimeValue.ofSeconds(15))
+                .setSoTimeout(Timeout.ofSeconds(10))
+                .build()
+        );
+
+        final RequestConfig defaultRequestConfig = RequestConfig.custom()
+                .setCookieSpec(StandardCookieSpec.RELAXED)
+                .setResponseTimeout(10, TimeUnit.SECONDS)
+                .setConnectionRequestTimeout(8, TimeUnit.SECONDS)
+                .build();
+
+        return HttpClients.custom()
+                .setDefaultCookieStore(cookies)
+                .setConnectionManager(connectionManager)
+                .setRetryStrategy(this)
+                .setKeepAliveStrategy(new ConnectionKeepAliveStrategy() {
+                    // Default keep-alive duration is 3 minutes, but this is too long for us,
+                    // as we are either going to re-use it fairly quickly or close it for a long time.
+                    //
+                    // So we set it to 30 seconds or clamp the server-provided value to a minimum of 10 seconds.
+                    private static final TimeValue defaultValue = TimeValue.ofSeconds(30);
+
+                    @Override
+                    public TimeValue getKeepAliveDuration(HttpResponse response, HttpContext context) {
+                        final Iterator<HeaderElement> it = MessageSupport.iterate(response, HeaderElements.KEEP_ALIVE);
+
+                        while (it.hasNext()) {
+                            final HeaderElement he = it.next();
+                            final String param = he.getName();
+                            final String value = he.getValue();
+
+                            if (value == null)
+                                continue;
+                            if (!"timeout".equalsIgnoreCase(param))
+                                continue;
+
+                            try {
+                                long timeout = Long.parseLong(value);
+                                timeout = Math.clamp(timeout, 30, defaultValue.toSeconds());
+                                return TimeValue.ofSeconds(timeout);
+                            } catch (final NumberFormatException ignore) {
+                                break;
+                            }
+                        }
+                        return defaultValue;
+                    }
+                })
+                .disableRedirectHandling()
+                .setDefaultRequestConfig(defaultRequestConfig)
                 .build();
     }
 
     @Override
-    public Cookies getCookies() {
+    public CookieStore getCookies() {
         return cookies;
     }
 
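The anonymous ConnectionKeepAliveStrategy above caps how long idle pooled connections are kept, clamping any server-supplied Keep-Alive timeout hint. Because the interface has a single abstract method, a simpler fixed policy can be handed to the builder as a lambda; a minimal sketch assuming a stock httpclient5 setup, without the connection manager and retry wiring shown in the diff:

import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.util.TimeValue;

class KeepAliveSketch {
    public static void main(String[] args) throws Exception {
        // Keep idle pooled connections for at most 30 seconds, regardless of the server's hint.
        ConnectionKeepAliveStrategy thirtySeconds = (response, context) -> TimeValue.ofSeconds(30);

        try (CloseableHttpClient client = HttpClients.custom()
                .setKeepAliveStrategy(thirtySeconds)
                .build()) {
            System.out.println("Client built with fixed keep-alive: " + client);
        }
    }
}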
@@ -89,19 +156,27 @@ public class HttpFetcherImpl implements HttpFetcher {
     @Inject
     public HttpFetcherImpl(UserAgent userAgent)
     {
-        this.client = createClient();
+        try {
+            this.client = createClient();
+        } catch (NoSuchAlgorithmException e) {
+            throw new RuntimeException(e);
+        }
         this.userAgentString = userAgent.uaString();
         this.userAgentIdentifier = userAgent.uaIdentifier();
     }
 
     public HttpFetcherImpl(String userAgent) {
-        this.client = createClient();
+        try {
+            this.client = createClient();
+        } catch (NoSuchAlgorithmException e) {
+            throw new RuntimeException(e);
+        }
         this.userAgentString = userAgent;
         this.userAgentIdentifier = userAgent;
     }
 
     // Not necessary in prod, but useful in test
-    public void close() {
+    public void close() throws IOException {
         client.close();
     }
 
@@ -114,34 +189,94 @@ public class HttpFetcherImpl implements HttpFetcher {
      */
     @Override
     public DomainProbeResult probeDomain(EdgeUrl url) {
-        HttpRequest head;
-        try {
-            head = HttpRequest.newBuilder()
-                    .HEAD()
-                    .uri(url.asURI())
-                    .header("User-agent", userAgentString)
-                    .timeout(probeTimeout)
-                    .build();
-        } catch (URISyntaxException e) {
-            return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
-        }
+        List<EdgeUrl> urls = new ArrayList<>();
+        urls.add(url);
 
-        for (int tries = 0;; tries++) {
+        int redirects = 0;
+        AtomicBoolean tryGet = new AtomicBoolean(false);
+
+        while (!urls.isEmpty() && ++redirects < 5) {
+            ClassicHttpRequest request;
+
+            EdgeUrl topUrl = urls.removeFirst();
             try {
-                var rsp = SendLock.wrapSend(client, head, HttpResponse.BodyHandlers.discarding());
-                EdgeUrl rspUri = new EdgeUrl(rsp.uri());
-
-                if (!Objects.equals(rspUri.domain, url.domain)) {
-                    return new DomainProbeResult.Redirect(rspUri.domain);
-                }
-                return new DomainProbeResult.Ok(rspUri);
-            } catch (Exception ex) {
-                if (tries > 3) {
-                    return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
-                }
-                // else try again ...
+                if (tryGet.get()) {
+                    request = ClassicRequestBuilder.get(topUrl.asURI())
+                            .addHeader("User-Agent", userAgentString)
+                            .addHeader("Accept-Encoding", "gzip")
+                            .addHeader("Range", "bytes=0-255")
+                            .build();
+                } else {
+                    request = ClassicRequestBuilder.head(topUrl.asURI())
+                            .addHeader("User-Agent", userAgentString)
+                            .addHeader("Accept-Encoding", "gzip")
+                            .build();
+                }
+            } catch (URISyntaxException e) {
+                return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
             }
+
+            try {
+                var result = SendLock.wrapSend(client, request, response -> {
+                    EntityUtils.consume(response.getEntity());
+
+                    return switch (response.getCode()) {
+                        case 200 -> new DomainProbeResult.Ok(url);
+                        case 405 -> {
+                            if (!tryGet.get()) {
+                                tryGet.set(true);
+                                yield new DomainProbeResult.RedirectSameDomain_Internal(url);
+                            }
+                            else {
+                                yield new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "HTTP status 405, tried HEAD and GET?!");
+                            }
+                        }
+                        case 301, 302, 307 -> {
+                            var location = response.getFirstHeader("Location");
+
+                            if (location != null) {
+                                Optional<EdgeUrl> newUrl = linkParser.parseLink(topUrl, location.getValue());
+                                if (newUrl.isEmpty()) {
+                                    yield new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid location header on redirect");
+                                }
+                                EdgeUrl newEdgeUrl = newUrl.get();
+                                if (newEdgeUrl.domain.equals(topUrl.domain)) {
+                                    yield new DomainProbeResult.RedirectSameDomain_Internal(newEdgeUrl);
+                                }
+                                else {
+                                    yield new DomainProbeResult.Redirect(newEdgeUrl.domain);
+                                }
+                            }
+
+                            yield new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "No location header on redirect");
+
+                        }
+                        default ->
+                            new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "HTTP status " + response.getCode());
+                    };
+                });
+
+                if (result instanceof DomainProbeResult.RedirectSameDomain_Internal(EdgeUrl redirUrl)) {
+                    urls.add(redirUrl);
+                }
+                else {
+                    return result;
+                }
+
+                // We don't have robots.txt yet, so we'll assume a request delay of 1 second
+                TimeUnit.SECONDS.sleep(1);
+            }
+            catch (SocketTimeoutException ex) {
+                return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Timeout during domain probe");
+            }
+            catch (Exception ex) {
+                return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Error during domain probe");
+            }
+
         }
+
+        return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Failed to resolve domain root");
+
     }
 
     /** Perform a HEAD request to fetch the content type of a URL.
@@ -152,70 +287,72 @@ public class HttpFetcherImpl implements HttpFetcher {
      * recorded in the WARC file on failure.
      */
     public ContentTypeProbeResult probeContentType(EdgeUrl url,
-                                                   WarcRecorder warcRecorder,
-                                                   ContentTags tags) throws RateLimitException {
-        if (tags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) {
-            try {
-                var headBuilder = HttpRequest.newBuilder()
-                        .HEAD()
-                        .uri(url.asURI())
-                        .header("User-Agent", userAgentString)
-                        .header("Accept-Encoding", "gzip")
-                        .timeout(requestTimeout)
-                ;
-
-                var rsp = SendLock.wrapSend(client, headBuilder.build(), HttpResponse.BodyHandlers.discarding());
-                var headers = rsp.headers();
-
-                var contentTypeHeader = headers.firstValue("Content-Type").orElse(null);
-
-                if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
-                    warcRecorder.flagAsFailedContentTypeProbe(url, contentTypeHeader, rsp.statusCode());
-
-                    return new ContentTypeProbeResult.BadContentType(contentTypeHeader, rsp.statusCode());
-                }
-
-                // Update the URL to the final URL of the HEAD request, otherwise we might end up doing
-
-                // HEAD 301 url1 -> url2
-                // HEAD 200 url2
-                // GET 301 url1 -> url2
-                // GET 200 url2
-
-                // which is not what we want. Overall we want to do as few requests as possible to not raise
-                // too many eyebrows when looking at the logs on the target server. Overall it's probably desirable
-                // that it looks like the traffic makes sense, as opposed to looking like a broken bot.
-
-                var redirectUrl = new EdgeUrl(rsp.uri());
-                EdgeUrl ret;
-
-                if (Objects.equals(redirectUrl.domain, url.domain)) ret = redirectUrl;
-                else ret = url;
-
-                // Intercept rate limiting
-                if (rsp.statusCode() == 429) {
-                    throw new HttpFetcherImpl.RateLimitException(headers.firstValue("Retry-After").orElse("1"));
-                }
-
-                return new ContentTypeProbeResult.Ok(ret);
-            }
-            catch (HttpTimeoutException ex) {
-                warcRecorder.flagAsTimeout(url);
-                return new ContentTypeProbeResult.Timeout(ex);
-            }
-            catch (RateLimitException ex) {
-                throw ex;
-            }
-            catch (Exception ex) {
-                logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
-
-                warcRecorder.flagAsError(url, ex);
-
-                return new ContentTypeProbeResult.Exception(ex);
-            }
-        }
-        return new ContentTypeProbeResult.Ok(url);
-    }
+                                                   CrawlDelayTimer timer,
+                                                   ContentTags tags) {
+        if (!tags.isEmpty() || !contentTypeLogic.isUrlLikeBinary(url)) {
+            return new ContentTypeProbeResult.NoOp();
+        }
+
+        try {
+            ClassicHttpRequest head = ClassicRequestBuilder.head(url.asURI())
+                    .addHeader("User-Agent", userAgentString)
+                    .addHeader("Accept-Encoding", "gzip")
+                    .build();
+
+            var result = SendLock.wrapSend(client, head, (rsp) -> {
+                EntityUtils.consume(rsp.getEntity());
+                int statusCode = rsp.getCode();
+
+                // Handle redirects
+                if (statusCode == 301 || statusCode == 302 || statusCode == 307) {
+                    var location = rsp.getFirstHeader("Location");
+                    if (location != null) {
+                        Optional<EdgeUrl> newUrl = linkParser.parseLink(url, location.getValue());
+                        if (newUrl.isEmpty())
+                            return new ContentTypeProbeResult.HttpError(statusCode, "Invalid location header on redirect");
+                        return new ContentTypeProbeResult.Redirect(newUrl.get());
+                    }
+                }
+
+                if (statusCode == 405) {
+                    // If we get a 405, we can't probe the content type with HEAD, so we'll just say it's ok
+                    return new ContentTypeProbeResult.Ok(url);
+                }
+
+                // Handle errors
+                if (statusCode < 200 || statusCode > 300) {
+                    return new ContentTypeProbeResult.HttpError(statusCode, "Bad status code");
+                }
+
+                // Handle missing content type
+                var ctHeader = rsp.getFirstHeader("Content-Type");
+                if (ctHeader == null) {
+                    return new ContentTypeProbeResult.HttpError(statusCode, "Missing Content-Type header");
+                }
+                var contentType = ctHeader.getValue();
+
+                // Check if the content type is allowed
+                if (contentTypeLogic.isAllowableContentType(contentType)) {
+                    return new ContentTypeProbeResult.Ok(url);
+                } else {
+                    return new ContentTypeProbeResult.BadContentType(contentType, statusCode);
+                }
+            });
+
+            return result;
+        }
+        catch (SocketTimeoutException ex) {
+            return new ContentTypeProbeResult.Timeout(ex);
+        }
+        catch (Exception ex) {
+            logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
+            return new ContentTypeProbeResult.Exception(ex);
+        }
+        finally {
+            timer.waitFetchDelay();
+        }
+    }
 
     /** Fetch the content of a URL, and record it in a WARC file,
@@ -225,38 +362,73 @@ public class HttpFetcherImpl implements HttpFetcher {
     @Override
     public HttpFetchResult fetchContent(EdgeUrl url,
                                         WarcRecorder warcRecorder,
+                                        CrawlDelayTimer timer,
                                         ContentTags contentTags,
                                         ProbeType probeType)
-        throws Exception
     {
-        var getBuilder = HttpRequest.newBuilder()
-                .GET()
-                .uri(url.asURI())
-                .header("User-Agent", userAgentString)
-                .header("Accept-Encoding", "gzip")
-                .header("Accept-Language", "en,*;q=0.5")
-                .header("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
-                .timeout(requestTimeout)
-                ;
-
-        contentTags.paint(getBuilder);
-
-        try (var sl = new SendLock()) {
-            HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());
-
-            if (result instanceof HttpFetchResult.ResultOk ok) {
-                if (ok.statusCode() == 429) {
-                    throw new RateLimitException(Objects.requireNonNullElse(ok.header("Retry-After"), "1"));
-                }
-                if (ok.statusCode() == 304) {
-                    return new HttpFetchResult.Result304Raw();
-                }
-                if (ok.statusCode() == 200) {
-                    return ok;
-                }
-            }
-
-            return result;
-        }
+        try {
+            if (probeType == HttpFetcher.ProbeType.FULL) {
+                try {
+                    var probeResult = probeContentType(url, timer, contentTags);
+                    logger.info(crawlerAuditMarker, "Probe result {} for {}", probeResult.getClass().getSimpleName(), url);
+                    switch (probeResult) {
+                        case HttpFetcher.ContentTypeProbeResult.NoOp():
+                            break; //
+                        case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl):
+                            url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching
+                            break;
+                        case ContentTypeProbeResult.BadContentType badContentType:
+                            warcRecorder.flagAsFailedContentTypeProbe(url, badContentType.contentType(), badContentType.statusCode());
+                            return new HttpFetchResult.ResultNone();
+                        case ContentTypeProbeResult.BadContentType.Timeout(Exception ex):
+                            warcRecorder.flagAsTimeout(url);
+                            return new HttpFetchResult.ResultException(ex);
+                        case ContentTypeProbeResult.Exception(Exception ex):
+                            warcRecorder.flagAsError(url, ex);
+                            return new HttpFetchResult.ResultException(ex);
+                        case ContentTypeProbeResult.HttpError httpError:
+                            return new HttpFetchResult.ResultException(new HttpException("HTTP status code " + httpError.statusCode() + ": " + httpError.message()));
+                        case ContentTypeProbeResult.Redirect redirect:
+                            return new HttpFetchResult.ResultRedirect(redirect.location());
+                    }
+                } catch (Exception ex) {
+                    logger.warn("Failed to fetch {}", url, ex);
+                    return new HttpFetchResult.ResultException(ex);
+                }
+            }
+
+            ClassicRequestBuilder getBuilder = ClassicRequestBuilder.get(url.asURI())
+                    .addHeader("User-Agent", userAgentString)
+                    .addHeader("Accept-Encoding", "gzip")
+                    .addHeader("Accept-Language", "en,*;q=0.5")
+                    .addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8");
+
+            contentTags.paint(getBuilder);
+
+            try (var sl = new SendLock()) {
+                HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());
+
+                if (result instanceof HttpFetchResult.ResultOk ok) {
+                    if (ok.statusCode() == 304) {
+                        return new HttpFetchResult.Result304Raw();
+                    }
+                }
+
+                switch (result) {
+                    case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {}", ok.statusCode(), url);
+                    case HttpFetchResult.ResultRedirect redirect -> logger.info(crawlerAuditMarker, "Fetch result redirect: {} for {}", redirect.url(), url);
+                    case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none for {}", url);
+                    case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception: {} for {}", ex.getClass().getSimpleName(), url);
+                    case HttpFetchResult.Result304Raw raw -> logger.info(crawlerAuditMarker, "Fetch result: 304 Raw for {}", url);
+                    case HttpFetchResult.Result304ReplacedWithReference ref -> logger.info(crawlerAuditMarker, "Fetch result: 304 With reference for {}", url);
+                }
+
+                return result;
+            }
+        }
+        catch (Exception ex) {
+            ex.printStackTrace();
+            return new HttpFetchResult.ResultException(ex);
+        }
 
     }
@@ -323,67 +495,60 @@ public class HttpFetcherImpl implements HttpFetcher {
 
 
     private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
-        HttpRequest getRequest = HttpRequest.newBuilder()
-                .GET()
-                .uri(sitemapUrl.asURI())
-                .header("Accept-Encoding", "gzip")
-                .header("Accept", "text/*, */*;q=0.9")
-                .header("User-Agent", userAgentString)
-                .timeout(requestTimeout)
-                .build();
-
-        try (var sl = new SendLock()) {
-            var response = client.send(getRequest, HttpResponse.BodyHandlers.ofInputStream());
-            if (response.statusCode() != 200) {
-                return new SitemapResult.SitemapError();
-            }
-
-            Document parsedSitemap;
-
-            try (InputStream inputStream = response.body()) {
-                InputStream parserStream;
-                if (sitemapUrl.path.endsWith(".gz")) {
-                    parserStream = new GZIPInputStream(inputStream);
-                } else {
-                    parserStream = inputStream;
-                }
-
-                parsedSitemap = Jsoup.parse(parserStream, "UTF-8", sitemapUrl.toString(), Parser.xmlParser());
-            }
-            finally {
-                sl.close();
-            }
-
-            if (parsedSitemap.childrenSize() == 0) {
-                return new SitemapResult.SitemapError();
-            }
-
-            String rootTagName = parsedSitemap.child(0).tagName();
-
-            return switch (rootTagName.toLowerCase()) {
-                case "sitemapindex" -> {
-                    List<String> references = new ArrayList<>();
-                    for (var locTag : parsedSitemap.getElementsByTag("loc")) {
-                        references.add(locTag.text().trim());
-                    }
-                    yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
-                }
-                case "urlset" -> {
-                    List<String> urls = new ArrayList<>();
-                    for (var locTag : parsedSitemap.select("url > loc")) {
-                        urls.add(locTag.text().trim());
-                    }
-                    yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
-                }
-                case "rss", "atom" -> {
-                    List<String> urls = new ArrayList<>();
-                    for (var locTag : parsedSitemap.select("link, url")) {
-                        urls.add(locTag.text().trim());
-                    }
-                    yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
-                }
-                default -> new SitemapResult.SitemapError();
-            };
-        }
-    }
+        ClassicHttpRequest getRequest = ClassicRequestBuilder.get(sitemapUrl.asURI())
+                .addHeader("User-Agent", userAgentString)
+                .addHeader("Accept-Encoding", "gzip")
+                .addHeader("Accept", "text/*, */*;q=0.9")
+                .addHeader("User-Agent", userAgentString)
+                .build();
+
+        try (var sl = new SendLock()) {
+            return client.execute(getRequest, response -> {
+                if (response.getCode() != 200) {
+                    return new SitemapResult.SitemapError();
+                }
+
+                Document parsedSitemap = Jsoup.parse(
+                        EntityUtils.toString(response.getEntity()),
+                        sitemapUrl.toString(),
+                        Parser.xmlParser()
+                );
+
+                if (parsedSitemap.childrenSize() == 0) {
+                    return new SitemapResult.SitemapError();
+                }
+
+                String rootTagName = parsedSitemap.child(0).tagName();
+
+                return switch (rootTagName.toLowerCase()) {
+                    case "sitemapindex" -> {
+                        List<String> references = new ArrayList<>();
+                        for (var locTag : parsedSitemap.getElementsByTag("loc")) {
+                            references.add(locTag.text().trim());
+                        }
+                        yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
+                    }
+                    case "urlset" -> {
+                        List<String> urls = new ArrayList<>();
+                        for (var locTag : parsedSitemap.select("url > loc")) {
+                            urls.add(locTag.text().trim());
+                        }
+                        yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
+                    }
+                    case "rss", "atom" -> {
+                        List<String> urls = new ArrayList<>();
+                        for (var locTag : parsedSitemap.select("link, url")) {
+                            urls.add(locTag.text().trim());
+                        }
+                        yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
+                    }
+                    default -> new SitemapResult.SitemapError();
+                };
+            });
+        }
+        catch (Exception ex) {
+            logger.warn("Error while fetching sitemap {}: {} ({})", sitemapUrl, ex.getClass().getSimpleName(), ex.getMessage());
+            return new SitemapResult.SitemapError();
+        }
+    }
 
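The sitemap fetch above illustrates the general migration pattern in this deploy: java.net.http's send() with a BodyHandler is replaced by Apache HttpClient 5's handler-based execute(), where the entity is consumed inside the callback so the pooled connection can be released as soon as the lambda returns. A stripped-down sketch of that pattern outside the crawler, with an illustrative URL and user agent:

import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ClassicHttpRequest;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;

class ResponseHandlerSketch {
    public static void main(String[] args) throws Exception {
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            ClassicHttpRequest request = ClassicRequestBuilder.get("https://example.com/sitemap.xml")
                    .addHeader("User-Agent", "example-agent")
                    .addHeader("Accept-Encoding", "gzip")
                    .build();

            // The handler sees the live response; returning from it releases the connection.
            int statusCode = client.execute(request, response -> {
                EntityUtils.consume(response.getEntity());
                return response.getCode();
            });

            System.out.println("Status: " + statusCode);
        }
    }
}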
@@ -408,15 +573,14 @@ public class HttpFetcherImpl implements HttpFetcher {
 
     private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
         try (var sl = new SendLock()) {
-            var getRequest = HttpRequest.newBuilder()
-                    .GET()
-                    .uri(url.asURI())
-                    .header("Accept-Encoding", "gzip")
-                    .header("Accept", "text/*, */*;q=0.9")
-                    .header("User-Agent", userAgentString)
-                    .timeout(requestTimeout);
 
-            HttpFetchResult result = recorder.fetch(client, getRequest.build());
+            ClassicHttpRequest request = ClassicRequestBuilder.get(url.asURI())
+                    .addHeader("User-Agent", userAgentString)
+                    .addHeader("Accept-Encoding", "gzip")
+                    .addHeader("Accept", "text/*, */*;q=0.9")
+                    .build();
+
+            HttpFetchResult result = recorder.fetch(client, request);
 
             return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) ->
                     robotsParser.parseContent(url.toString(),
@@ -430,6 +594,56 @@ public class HttpFetcherImpl implements HttpFetcher {
         }
     }
 
+    @Override
+    public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
+        if (exception instanceof SocketTimeoutException ex) {
+            return false;
+        }
+
+        return executionCount < 3;
+    }
+
+    @Override
+    public boolean retryRequest(HttpResponse response, int executionCount, HttpContext context) {
+        return switch (response.getCode()) {
+            case 500, 503 -> executionCount < 2;
+            case 429 -> executionCount < 3;
+            default -> false;
+        };
+    }
+
+    @Override
+    public TimeValue getRetryInterval(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
+        return TimeValue.ofSeconds(1);
+    }
+
+    @Override
+    public TimeValue getRetryInterval(HttpResponse response, int executionCount, HttpContext context) {
+
+        int statusCode = response.getCode();
+
+        // Give 503 a bit more time
+        if (statusCode == 503) return TimeValue.ofSeconds(5);
+
+        if (statusCode == 429) {
+            // get the Retry-After header
+            String retryAfter = response.getFirstHeader("Retry-After").getValue();
+            if (retryAfter == null) {
+                return TimeValue.ofSeconds(2);
+            }
+
+            try {
+                int retryAfterTime = Integer.parseInt(retryAfter);
+                retryAfterTime = Math.clamp(retryAfterTime, 1, 5);
+
+                return TimeValue.ofSeconds(retryAfterTime);
+            } catch (NumberFormatException e) {
+                logger.warn("Invalid Retry-After header: {}", retryAfter);
+            }
+        }
+
+        return TimeValue.ofSeconds(2);
+    }
+
     public static class RateLimitException extends Exception {
         private final String retryAfter;
@@ -462,9 +676,10 @@ class SendLock implements AutoCloseable {
         maxConcurrentRequests.acquireUninterruptibly();
     }
 
-    public static <T> HttpResponse<T> wrapSend(HttpClient client, HttpRequest request, HttpResponse.BodyHandler<T> handler) throws IOException, InterruptedException {
+    public static <T> T wrapSend(HttpClient client, final ClassicHttpRequest request,
+                                 final HttpClientResponseHandler<? extends T> responseHandler) throws IOException {
         try (var lock = new SendLock()) {
-            return client.send(request, handler);
+            return client.execute(request, responseHandler);
         }
     }
 
@@ -1,15 +1,19 @@
 package nu.marginalia.crawl.fetcher.warc;
 
+import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.BOMInputStream;
+import org.apache.hc.core5.http.ClassicHttpResponse;
+import org.apache.hc.core5.http.Header;
 import org.netpreserve.jwarc.WarcTruncationReason;
 
 import java.io.*;
-import java.net.http.HttpHeaders;
-import java.net.http.HttpResponse;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.Map;
-import java.util.zip.GZIPInputStream;
+import java.time.Duration;
+import java.time.Instant;
+import java.util.Arrays;
+
+import static nu.marginalia.crawl.fetcher.warc.ErrorBuffer.suppressContentEncoding;
 
 /** Input buffer for temporary storage of a HTTP response
  * This may be in-memory or on-disk, at the discretion of
@@ -17,9 +21,9 @@ import java.util.zip.GZIPInputStream;
  * */
 public abstract class WarcInputBuffer implements AutoCloseable {
     protected WarcTruncationReason truncationReason = WarcTruncationReason.NOT_TRUNCATED;
-    protected HttpHeaders headers;
+    protected Header[] headers;
 
-    WarcInputBuffer(HttpHeaders headers) {
+    WarcInputBuffer(Header[] headers) {
         this.headers = headers;
     }
 
@@ -31,7 +35,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
 
     public final WarcTruncationReason truncationReason() { return truncationReason; }
 
-    public final HttpHeaders headers() { return headers; }
+    public final Header[] headers() { return headers; }
 
     /** Create a buffer for a response.
      * If the response is small and not compressed, it will be stored in memory.
@@ -39,34 +43,37 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
|||||||
* and suppressed from the headers.
|
* and suppressed from the headers.
|
||||||
* If an error occurs, a buffer will be created with no content and an error status.
|
* If an error occurs, a buffer will be created with no content and an error status.
|
||||||
*/
|
*/
|
||||||
static WarcInputBuffer forResponse(HttpResponse<InputStream> rsp) {
|
static WarcInputBuffer forResponse(ClassicHttpResponse response, Duration timeLimit) throws IOException {
|
||||||
if (rsp == null)
|
if (response == null)
|
||||||
return new ErrorBuffer();
|
return new ErrorBuffer();
|
||||||
|
|
||||||
var headers = rsp.headers();
|
|
||||||
|
|
||||||
try (var is = rsp.body()) {
|
var entity = response.getEntity();
|
||||||
int contentLength = (int) headers.firstValueAsLong("Content-Length").orElse(-1L);
|
|
||||||
String contentEncoding = headers.firstValue("Content-Encoding").orElse(null);
|
|
||||||
|
|
||||||
if (contentEncoding == null && contentLength > 0 && contentLength < 8192) {
|
if (null == entity) {
|
||||||
|
return new ErrorBuffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
InputStream is = entity.getContent();
|
||||||
|
long length = entity.getContentLength();
|
||||||
|
|
||||||
|
try (response) {
|
||||||
|
if (length > 0 && length < 8192) {
|
||||||
// If the content is small and not compressed, we can just read it into memory
|
// If the content is small and not compressed, we can just read it into memory
|
||||||
return new MemoryBuffer(headers, is, contentLength);
|
return new MemoryBuffer(response.getHeaders(), timeLimit, is, (int) length);
|
||||||
}
|
} else {
|
||||||
else {
|
|
||||||
// Otherwise, we unpack it into a file and read it from there
|
// Otherwise, we unpack it into a file and read it from there
|
||||||
return new FileBuffer(headers, is);
|
return new FileBuffer(response.getHeaders(), timeLimit, is);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
|
||||||
return new ErrorBuffer();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Copy an input stream to an output stream, with a maximum size and time limit */
|
/** Copy an input stream to an output stream, with a maximum size and time limit */
|
||||||
protected void copy(InputStream is, OutputStream os) {
|
protected void copy(InputStream is, OutputStream os, Duration timeLimit) {
|
||||||
long startTime = System.currentTimeMillis();
|
Instant start = Instant.now();
|
||||||
|
Instant timeout = start.plus(timeLimit);
|
||||||
long size = 0;
|
long size = 0;
|
||||||
|
|
||||||
byte[] buffer = new byte[8192];
|
byte[] buffer = new byte[8192];
|
||||||
@@ -76,24 +83,106 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
|||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
try {
|
try {
|
||||||
int n = is.read(buffer);
|
Duration remaining = Duration.between(Instant.now(), timeout);
|
||||||
if (n < 0) break;
|
if (remaining.isNegative()) {
|
||||||
size += n;
|
|
||||||
os.write(buffer, 0, n);
|
|
||||||
|
|
||||||
if (size > WarcRecorder.MAX_SIZE) {
|
|
||||||
truncationReason = WarcTruncationReason.LENGTH;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (System.currentTimeMillis() - startTime > WarcRecorder.MAX_TIME) {
|
|
||||||
truncationReason = WarcTruncationReason.TIME;
|
truncationReason = WarcTruncationReason.TIME;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int n = is.read(buffer);
|
||||||
|
|
||||||
|
if (n < 0) break;
|
||||||
|
size += n;
|
||||||
|
|
||||||
|
// Even if we've exceeded the max length,
|
||||||
|
// we keep consuming the stream up until the end or a timeout,
|
||||||
|
// as closing the stream means resetting the connection, and
|
||||||
|
// that's generally not desirable.
|
||||||
|
|
||||||
|
if (size < WarcRecorder.MAX_SIZE) {
|
||||||
|
os.write(buffer, 0, n);
|
||||||
|
}
|
||||||
|
else if (truncationReason != WarcTruncationReason.LENGTH) {
|
||||||
|
truncationReason = WarcTruncationReason.LENGTH;
|
||||||
|
}
|
||||||
|
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new RuntimeException(e);
|
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Try to close the connection as long as we haven't timed out.
|
||||||
|
// As per Apache HttpClient's semantics, this will reset the connection
|
||||||
|
// and close the stream if we have timed out.
|
||||||
|
|
||||||
|
if (truncationReason != WarcTruncationReason.TIME) {
|
||||||
|
IOUtils.closeQuietly(is);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Takes a Content-Range header and checks if it is complete.
|
||||||
|
* A complete range is one that covers the entire resource.
|
||||||
|
* For example, "bytes 0-1023/2048" or "bytes 0-1023/*" are complete ranges.
|
||||||
|
* "bytes 0-1023/2048" is not a complete range.
|
||||||
|
*/
|
||||||
|
public boolean isRangeComplete(Header[] headers) {
|
||||||
|
// Find the Content-Range header
|
||||||
|
String contentRangeHeader = null;
|
||||||
|
for (var header : headers) {
|
||||||
|
if ("Content-Range".equalsIgnoreCase(header.getName())) {
|
||||||
|
contentRangeHeader = header.getValue();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return true if header is null or empty
|
||||||
|
if (contentRangeHeader == null || contentRangeHeader.isEmpty()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Content-Range format: "bytes range-start-range-end/size"
|
||||||
|
// e.g., "bytes 0-1023/2048" or "bytes 0-1023/*"
|
||||||
|
|
||||||
|
// Get the part after "bytes "
|
||||||
|
String[] parts = contentRangeHeader.split(" ", 2);
|
||||||
|
if (parts.length < 2) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the range and size parts (e.g., "0-1023/2048")
|
||||||
|
String rangeAndSize = parts[1];
|
||||||
|
String[] rangeAndSizeParts = rangeAndSize.split("/", 2);
|
||||||
|
if (rangeAndSizeParts.length < 2) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the range (e.g., "0-1023")
|
||||||
|
String range = rangeAndSizeParts[0];
|
||||||
|
String[] rangeParts = range.split("-", 2);
|
||||||
|
if (rangeParts.length < 2) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the size (e.g., "2048" or "*")
|
||||||
|
String size = rangeAndSizeParts[1];
|
||||||
|
|
||||||
|
// If size is "*", we don't know the total size, so return false
|
||||||
|
if ("*".equals(size)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse as long to handle large files
|
||||||
|
long rangeStart = Long.parseLong(rangeParts[0]);
|
||||||
|
long rangeEnd = Long.parseLong(rangeParts[1]);
|
||||||
|
long totalSize = Long.parseLong(size);
|
||||||
|
|
||||||
|
// Check if the range covers the entire resource
|
||||||
|
return rangeStart == 0 && rangeEnd == totalSize - 1;
|
||||||
|
|
||||||
|
} catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -101,7 +190,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
|||||||
/** Pseudo-buffer for when we have an error */
|
/** Pseudo-buffer for when we have an error */
|
||||||
class ErrorBuffer extends WarcInputBuffer {
|
class ErrorBuffer extends WarcInputBuffer {
|
||||||
public ErrorBuffer() {
|
public ErrorBuffer() {
|
||||||
super(HttpHeaders.of(Map.of(), (k,v)->false));
|
super(new Header[0]);
|
||||||
|
|
||||||
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
||||||
}
|
}
|
||||||
@@ -118,17 +207,29 @@ class ErrorBuffer extends WarcInputBuffer {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void close() throws Exception {}
|
public void close() throws Exception {}
|
||||||
|
|
||||||
|
|
||||||
|
static Header[] suppressContentEncoding(Header[] headers) {
|
||||||
|
return Arrays.stream(headers).filter(header -> !"Content-Encoding".equalsIgnoreCase(header.getName())).toArray(Header[]::new);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Buffer for when we have the response in memory */
|
/** Buffer for when we have the response in memory */
|
||||||
class MemoryBuffer extends WarcInputBuffer {
|
class MemoryBuffer extends WarcInputBuffer {
|
||||||
byte[] data;
|
byte[] data;
|
||||||
public MemoryBuffer(HttpHeaders headers, InputStream responseStream, int size) {
|
public MemoryBuffer(Header[] headers, Duration timeLimit, InputStream responseStream, int size) {
|
||||||
super(headers);
|
super(suppressContentEncoding(headers));
|
||||||
|
|
||||||
|
if (!isRangeComplete(headers)) {
|
||||||
|
truncationReason = WarcTruncationReason.LENGTH;
|
||||||
|
} else {
|
||||||
|
truncationReason = WarcTruncationReason.NOT_TRUNCATED;
|
||||||
|
}
|
||||||
|
|
||||||
var outputStream = new ByteArrayOutputStream(size);
|
var outputStream = new ByteArrayOutputStream(size);
|
||||||
|
|
||||||
copy(responseStream, outputStream);
|
copy(responseStream, outputStream, timeLimit);
|
||||||
|
|
||||||
data = outputStream.toByteArray();
|
data = outputStream.toByteArray();
|
||||||
}
|
}
|
||||||
@@ -152,40 +253,25 @@ class MemoryBuffer extends WarcInputBuffer {
|
|||||||
class FileBuffer extends WarcInputBuffer {
|
class FileBuffer extends WarcInputBuffer {
|
||||||
private final Path tempFile;
|
private final Path tempFile;
|
||||||
|
|
||||||
public FileBuffer(HttpHeaders headers, InputStream responseStream) throws IOException {
|
public FileBuffer(Header[] headers, Duration timeLimit, InputStream responseStream) throws IOException {
|
||||||
super(suppressContentEncoding(headers));
|
super(suppressContentEncoding(headers));
|
||||||
|
|
||||||
|
if (!isRangeComplete(headers)) {
|
||||||
|
truncationReason = WarcTruncationReason.LENGTH;
|
||||||
|
} else {
|
||||||
|
truncationReason = WarcTruncationReason.NOT_TRUNCATED;
|
||||||
|
}
|
||||||
|
|
||||||
this.tempFile = Files.createTempFile("rsp", ".html");
|
this.tempFile = Files.createTempFile("rsp", ".html");
|
||||||
|
|
||||||
|
try (var out = Files.newOutputStream(tempFile)) {
|
||||||
if ("gzip".equalsIgnoreCase(headers.firstValue("Content-Encoding").orElse(""))) {
|
copy(responseStream, out, timeLimit);
|
||||||
try (var out = Files.newOutputStream(tempFile)) {
|
|
||||||
copy(new GZIPInputStream(responseStream), out);
|
|
||||||
}
|
|
||||||
catch (Exception ex) {
|
|
||||||
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else {
|
catch (Exception ex) {
|
||||||
try (var out = Files.newOutputStream(tempFile)) {
|
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
||||||
copy(responseStream, out);
|
|
||||||
}
|
|
||||||
catch (Exception ex) {
|
|
||||||
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static HttpHeaders suppressContentEncoding(HttpHeaders headers) {
|
|
||||||
return HttpHeaders.of(headers.map(), (k, v) -> {
|
|
||||||
if ("Content-Encoding".equalsIgnoreCase(k)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return !"Transfer-Encoding".equalsIgnoreCase(k);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public InputStream read() throws IOException {
|
public InputStream read() throws IOException {
|
||||||
return Files.newInputStream(tempFile);
|
return Files.newInputStream(tempFile);
|
||||||
}
|
}
|
||||||
|
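For illustration, the new isRangeComplete check treats a partial Range response as truncated unless the Content-Range header (when present at all) covers the whole resource. Note that the Javadoc above lists "bytes 0-1023/2048" both as a complete and an incomplete example; by the implementation it is incomplete, and a "*" total size is also treated as incomplete. A sketch with invented header values, not code from the repository:

import org.apache.hc.core5.http.Header;
import org.apache.hc.core5.http.message.BasicHeader;

Header[] complete  = { new BasicHeader("Content-Range", "bytes 0-2047/2048") }; // starts at 0, ends at size-1 -> complete
Header[] partial   = { new BasicHeader("Content-Range", "bytes 0-1023/2048") }; // -> incomplete, buffer flagged as LENGTH-truncated
Header[] unknown   = { new BasicHeader("Content-Range", "bytes 0-1023/*") };    // unknown total size -> incomplete
Header[] noHeader  = { };                                                       // no Content-Range at all -> treated as complete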
@@ -1,6 +1,8 @@
 package nu.marginalia.crawl.fetcher.warc;

 import org.apache.commons.lang3.StringUtils;
+import org.apache.hc.core5.http.ClassicHttpResponse;
+import org.apache.hc.core5.http.Header;

 import java.net.URI;
 import java.net.URLEncoder;
@@ -17,7 +19,7 @@ import java.util.stream.Collectors;
 public class WarcProtocolReconstructor {

     static String getHttpRequestString(String method,
-                                       Map<String, List<String>> mainHeaders,
+                                       Header[] mainHeaders,
                                        Map<String, List<String>> extraHeaders,
                                        URI uri) {
         StringBuilder requestStringBuilder = new StringBuilder();
@@ -34,12 +36,13 @@ public class WarcProtocolReconstructor {

         Set<String> addedHeaders = new HashSet<>();

-        mainHeaders.forEach((k, values) -> {
-            for (var value : values) {
-                addedHeaders.add(k);
-                requestStringBuilder.append(capitalizeHeader(k)).append(": ").append(value).append("\r\n");
-            }
-        });
+        for (var header : mainHeaders) {
+            String k = header.getName();
+            String v = header.getValue();
+
+            addedHeaders.add(k);
+            requestStringBuilder.append(capitalizeHeader(k)).append(": ").append(v).append("\r\n");
+        }

         extraHeaders.forEach((k, values) -> {
             if (!addedHeaders.contains(k)) {
@@ -87,6 +90,12 @@ public class WarcProtocolReconstructor {
         return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
     }

+    static String getResponseHeader(ClassicHttpResponse response, long size) {
+        String headerString = getHeadersAsString(response.getHeaders(), size);
+
+        return response.getVersion().format() + " " + response.getCode() + " " + response.getReasonPhrase() + "\r\n" + headerString + "\r\n\r\n";
+    }
+
     private static final Map<Integer, String> STATUS_CODE_MAP = Map.ofEntries(
             Map.entry(200, "OK"),
             Map.entry(201, "Created"),
@@ -149,6 +158,37 @@ public class WarcProtocolReconstructor {
         return joiner.toString();
     }

+    static private String getHeadersAsString(Header[] headers, long responseSize) {
+        StringJoiner joiner = new StringJoiner("\r\n");
+
+        for (var header : headers) {
+            String headerCapitalized = capitalizeHeader(header.getName());
+
+            // Omit pseudoheaders injected by the crawler itself
+            if (headerCapitalized.startsWith("X-Marginalia"))
+                continue;
+
+            // Omit Transfer-Encoding and Content-Encoding headers
+            if (headerCapitalized.equals("Transfer-Encoding"))
+                continue;
+            if (headerCapitalized.equals("Content-Encoding"))
+                continue;
+
+            // Since we're transparently decoding gzip, we need to update the Content-Length header
+            // to reflect the actual size of the response body. We'll do this at the end.
+            if (headerCapitalized.equals("Content-Length"))
+                continue;
+
+            joiner.add(headerCapitalized + ": " + header.getValue());
+        }
+
+        joiner.add("Content-Length: " + responseSize);
+
+        return joiner.toString();
+    }
+
     static private String getHeadersAsString(HttpHeaders headers, long responseSize) {
         StringJoiner joiner = new StringJoiner("\r\n");

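To illustrate what the new getResponseHeader emits: for a gzip-encoded response whose decoded body is 5120 bytes, the reconstructed status line and headers would look roughly as below (header values invented for the example). Content-Encoding, Transfer-Encoding, the crawler's X-Marginalia pseudoheaders and the original Content-Length are dropped, and a Content-Length reflecting the decoded size is appended:

HTTP/1.1 200 OK
Content-Type: text/html; charset=UTF-8
Content-Length: 5120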
@@ -1,11 +1,17 @@
 package nu.marginalia.crawl.fetcher.warc;

 import nu.marginalia.crawl.fetcher.ContentTags;
-import nu.marginalia.crawl.fetcher.Cookies;
+import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
+import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
+import org.apache.hc.client5.http.classic.HttpClient;
+import org.apache.hc.client5.http.cookie.BasicCookieStore;
+import org.apache.hc.client5.http.cookie.CookieStore;
+import org.apache.hc.core5.http.ClassicHttpRequest;
+import org.apache.hc.core5.http.NameValuePair;
 import org.jetbrains.annotations.Nullable;
 import org.netpreserve.jwarc.*;
 import org.slf4j.Logger;
@@ -14,10 +20,9 @@ import org.slf4j.LoggerFactory;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.InetAddress;
+import java.net.SocketTimeoutException;
 import java.net.URI;
 import java.net.URISyntaxException;
-import java.net.http.HttpClient;
-import java.net.http.HttpResponse;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -48,7 +53,8 @@ public class WarcRecorder implements AutoCloseable {
     // Affix a version string in case we need to change the format in the future
     // in some way
     private final String warcRecorderVersion = "1.0";
-    private final Cookies cookies;
+    private final CookieStore cookies;
+    private final LinkParser linkParser = new LinkParser();
     /**
      * Create a new WarcRecorder that will write to the given file
      *
@@ -60,7 +66,7 @@ public class WarcRecorder implements AutoCloseable {
         this.cookies = fetcher.getCookies();
     }

-    public WarcRecorder(Path warcFile, Cookies cookies) throws IOException {
+    public WarcRecorder(Path warcFile, CookieStore cookies) throws IOException {
         this.warcFile = warcFile;
         this.writer = new WarcWriter(warcFile);
         this.cookies = cookies;
@@ -73,16 +79,28 @@ public class WarcRecorder implements AutoCloseable {
     public WarcRecorder() throws IOException {
         this.warcFile = Files.createTempFile("warc", ".warc.gz");
         this.writer = new WarcWriter(this.warcFile);
-        this.cookies = new Cookies();
+        this.cookies = new BasicCookieStore();

         temporaryFile = true;
     }

+    private boolean hasCookies() {
+        return !cookies.getCookies().isEmpty();
+    }
+
     public HttpFetchResult fetch(HttpClient client,
-                                 java.net.http.HttpRequest request)
+                                 ClassicHttpRequest request)
             throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
     {
-        URI requestUri = request.uri();
+        return fetch(client, request, Duration.ofMillis(MAX_TIME));
+    }
+
+    public HttpFetchResult fetch(HttpClient client,
+                                 ClassicHttpRequest request,
+                                 Duration timeout)
+            throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
+    {
+        URI requestUri = request.getUri();

         WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
         WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
@@ -90,121 +108,148 @@ public class WarcRecorder implements AutoCloseable {
         Instant date = Instant.now();

         // Not entirely sure why we need to do this, but keeping it due to Chesterton's Fence
-        Map<String, List<String>> extraHeaders = new HashMap<>(request.headers().map());
-
-        HttpResponse<InputStream> response;
-        try {
-            response = client.send(request, java.net.http.HttpResponse.BodyHandlers.ofInputStream());
-        }
-        catch (Exception ex) {
-            logger.warn("Failed to fetch URL {}: {}", requestUri, ex.getMessage());
-            return new HttpFetchResult.ResultException(ex);
-        }
-
-        try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response);
-             InputStream inputStream = inputBuffer.read())
-        {
-            if (cookies.hasCookies()) {
-                extraHeaders.put("X-Has-Cookies", List.of("1"));
-            }
-
-            byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
-
-            ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);
-
-            responseDataBuffer.put(responseHeaders);
-            responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseHeaders.length);
-
-            int dataStart = responseDataBuffer.pos();
-
-            for (;;) {
-                int remainingLength = responseDataBuffer.remaining();
-                if (remainingLength == 0)
-                    break;
-
-                int startPos = responseDataBuffer.pos();
-
-                int n = responseDataBuffer.readFrom(inputStream, remainingLength);
-                if (n < 0)
-                    break;
-
-                responseDataBuffer.updateDigest(responseDigestBuilder, startPos, n);
-                responseDataBuffer.updateDigest(payloadDigestBuilder, startPos, n);
-            }
-
-            // It looks like this might be the same as requestUri, but it's not;
-            // it's the URI after resolving redirects.
-            final URI responseUri = response.uri();
-
-            WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
-                    .blockDigest(responseDigestBuilder.build())
-                    .date(date)
-                    .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
-
-            InetAddress inetAddress = InetAddress.getByName(responseUri.getHost());
-            responseBuilder.ipAddress(inetAddress);
-            responseBuilder.payloadDigest(payloadDigestBuilder.build());
-            responseBuilder.truncated(inputBuffer.truncationReason());
-
-            // Build and write the response
-
-            var warcResponse = responseBuilder.build();
-            warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
-            writer.write(warcResponse);
-
-            // Build and write the request
-
-            WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder();
-
-            byte[] httpRequestString = WarcProtocolReconstructor
-                    .getHttpRequestString(
-                            response.request().method(),
-                            response.request().headers().map(),
-                            extraHeaders,
-                            requestUri)
-                    .getBytes();
-
-            requestDigestBuilder.update(httpRequestString);
-
-            WarcRequest warcRequest = new WarcRequest.Builder(requestUri)
-                    .blockDigest(requestDigestBuilder.build())
-                    .date(date)
-                    .body(MediaType.HTTP_REQUEST, httpRequestString)
-                    .concurrentTo(warcResponse.id())
-                    .build();
-
-            warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
-            writer.write(warcRequest);
-
-            if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
-                    && inputBuffer.size() < 2048
-                    && !request.uri().getPath().endsWith("robots.txt")) // don't bail on robots.txt
-            {
-                // Fast detection and mitigation of crawler traps that respond with slow
-                // small responses, with a high branching factor
-
-                // Note we bail *after* writing the warc records, this will effectively only
-                // prevent link extraction from the document.
-
-                logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)",
-                        requestUri,
-                        Duration.between(date, Instant.now()).getSeconds(),
-                        inputBuffer.size()
-                );
-
-                return new HttpFetchResult.ResultException(new IOException("Likely crawler trap"));
-            }
-
-            return new HttpFetchResult.ResultOk(responseUri,
-                    response.statusCode(),
-                    inputBuffer.headers(),
-                    inetAddress.getHostAddress(),
-                    responseDataBuffer.data,
-                    dataStart,
-                    responseDataBuffer.length() - dataStart);
-        }
-        catch (Exception ex) {
-            logger.warn("Failed to fetch URL {}: {}", requestUri, ex.getMessage());
-            return new HttpFetchResult.ResultException(ex);
-        }
+        Map<String, List<String>> extraHeaders = new HashMap<>(request.getHeaders().length);
+
+        // Inject a range header to attempt to limit the size of the response
+        // to the maximum size we want to store, if the server supports it.
+        request.addHeader("Range", "bytes=0-"+MAX_SIZE);
+
+        try {
+            return client.execute(request, response -> {
+                try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, timeout);
+                     InputStream inputStream = inputBuffer.read()) {
+
+                    // Build and write the request
+
+                    WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder();
+
+                    byte[] httpRequestString = WarcProtocolReconstructor
+                            .getHttpRequestString(
+                                    request.getMethod(),
+                                    request.getHeaders(),
+                                    extraHeaders,
+                                    requestUri)
+                            .getBytes();
+
+                    requestDigestBuilder.update(httpRequestString);
+
+                    WarcRequest warcRequest = new WarcRequest.Builder(requestUri)
+                            .blockDigest(requestDigestBuilder.build())
+                            .date(date)
+                            .body(MediaType.HTTP_REQUEST, httpRequestString)
+                            .build();
+
+                    warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
+                    writer.write(warcRequest);
+
+                    if (hasCookies()) {
+                        extraHeaders.put("X-Has-Cookies", List.of("1"));
+                    }
+
+                    byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
+
+                    ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);
+
+                    responseDataBuffer.put(responseHeaders);
+                    responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseHeaders.length);
+
+                    int dataStart = responseDataBuffer.pos();
+
+                    for (;;) {
+                        int remainingLength = responseDataBuffer.remaining();
+                        if (remainingLength == 0)
+                            break;
+
+                        int startPos = responseDataBuffer.pos();
+
+                        int n = responseDataBuffer.readFrom(inputStream, remainingLength);
+                        if (n < 0)
+                            break;
+
+                        responseDataBuffer.updateDigest(responseDigestBuilder, startPos, n);
+                        responseDataBuffer.updateDigest(payloadDigestBuilder, startPos, n);
+                    }
+
+                    // with some http client libraries, that resolve redirects transparently, this might be different
+                    // from the request URI, but currently we don't have transparent redirect resolution so it's always
+                    // the same (though let's keep the variables separate in case this changes)
+                    final URI responseUri = requestUri;
+
+                    WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
+                            .blockDigest(responseDigestBuilder.build())
+                            .date(date)
+                            .concurrentTo(warcRequest.id())
+                            .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());

+                    InetAddress inetAddress = InetAddress.getByName(responseUri.getHost());
+                    responseBuilder.ipAddress(inetAddress);
+                    responseBuilder.payloadDigest(payloadDigestBuilder.build());
+                    responseBuilder.truncated(inputBuffer.truncationReason());
+
+                    // Build and write the response
+
+                    var warcResponse = responseBuilder.build();
+                    warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
+                    writer.write(warcResponse);
+
+                    if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
+                            && inputBuffer.size() < 2048
+                            && !requestUri.getPath().endsWith("robots.txt")) // don't bail on robots.txt
+                    {
+                        // Fast detection and mitigation of crawler traps that respond with slow
+                        // small responses, with a high branching factor
+
+                        // Note we bail *after* writing the warc records, this will effectively only
+                        // prevent link extraction from the document.
+
+                        logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)",
+                                requestUri,
+                                Duration.between(date, Instant.now()).getSeconds(),
+                                inputBuffer.size()
+                        );
+
+                        return new HttpFetchResult.ResultException(new IOException("Likely crawler trap"));
+                    }
+
+                    if (response.getCode() == 301 || response.getCode() == 302 || response.getCode() == 307) {
+                        // If the server responds with a redirect, we need to
+                        // update the request URI to the new location
+                        EdgeUrl redirectLocation = Optional.ofNullable(response.getFirstHeader("Location"))
+                                .map(NameValuePair::getValue)
+                                .flatMap(location -> linkParser.parseLink(new EdgeUrl(requestUri), location))
+                                .orElse(null);
+                        if (redirectLocation != null) {
+                            // If the redirect location is a valid URL, we need to update the request URI
+                            return new HttpFetchResult.ResultRedirect(redirectLocation);
+                        } else {
+                            // If the redirect location is not a valid URL, we need to throw an exception
+                            return new HttpFetchResult.ResultException(new IOException("Invalid redirect location: " + response.getFirstHeader("Location")));
+                        }
+                    }
+
+                    return new HttpFetchResult.ResultOk(responseUri,
+                            response.getCode(),
+                            inputBuffer.headers(),
+                            inetAddress.getHostAddress(),
+                            responseDataBuffer.data,
+                            dataStart,
+                            responseDataBuffer.length() - dataStart);
+                } catch (Exception ex) {
+                    flagAsError(new EdgeUrl(requestUri), ex); // write a WARC record to indicate the error
+                    logger.warn("Failed to fetch URL {}: {}", requestUri, ex.getMessage());
+                    return new HttpFetchResult.ResultException(ex);
+                }
+            });
+            // the client.execute() method will throw an exception if the request times out
+            // or on other IO exceptions, so we need to catch those here as well as having
+            // exception handling in the response handler
+        } catch (SocketTimeoutException ex) {
+            flagAsTimeout(new EdgeUrl(requestUri)); // write a WARC record to indicate the timeout
+            return new HttpFetchResult.ResultException(ex);
+        } catch (IOException ex) {
+            flagAsError(new EdgeUrl(requestUri), ex); // write a WARC record to indicate the error
+            logger.warn("Failed to fetch URL {}: {}", requestUri, ex.getMessage());
+            return new HttpFetchResult.ResultException(ex);
+        }
@@ -275,7 +320,7 @@ public class WarcRecorder implements AutoCloseable {
                 .date(Instant.now())
                 .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());

-        if (cookies.hasCookies()) {
+        if (hasCookies()) {
             builder.addHeader("X-Has-Cookies", "1");
         }

@@ -316,6 +361,9 @@ public class WarcRecorder implements AutoCloseable {
             case HttpFetcherImpl.DomainProbeResult.Ok ok:
                 fields.put("X-WARC-Probe-Status", List.of("OK"));
                 break;
+            case HttpFetcher.DomainProbeResult.RedirectSameDomain_Internal redirectSameDomain:
+                fields.put("X-WARC-Probe-Status", List.of("REDIR-INTERNAL"));
+                break;
         }

         var warcinfo = new Warcinfo.Builder()
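A hedged sketch of how the reworked fetch entry point might be called with Apache HttpClient 5 and an explicit time budget; the file path, URL and timeout are illustrative and exception handling is elided:

import org.apache.hc.client5.http.cookie.BasicCookieStore;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;

try (var recorder = new WarcRecorder(Path.of("/tmp/example.warc.gz"), new BasicCookieStore())) {
    var request = ClassicRequestBuilder.get("https://www.example.com/").build();

    HttpFetchResult result = recorder.fetch(HttpClients.createDefault(), request, Duration.ofSeconds(10));

    switch (result) {
        case HttpFetchResult.ResultOk ok -> { /* request and response were also written to the WARC */ }
        case HttpFetchResult.ResultRedirect redirect -> { /* 301/302/307: follow redirect.url() if same-domain */ }
        default -> { /* exception, timeout, or the crawler-trap bail-out above */ }
    }
}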
@@ -51,6 +51,10 @@ public class CrawlDelayTimer {
         waitFetchDelay(0);
     }

+    public void waitFetchDelay(Duration spentTime) {
+        waitFetchDelay(spentTime.toMillis());
+    }
+
     public void waitFetchDelay(long spentTime) {
         long sleepTime = delayTime;

@@ -0,0 +1,42 @@
+package nu.marginalia.crawl.retreival;
+
+import java.time.Duration;
+import java.time.Instant;
+import java.util.concurrent.Semaphore;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * This class is used to stagger the rate at which connections are created.
+ * <p></p>
+ * It is used to ensure that we do not create too many connections at once,
+ * which can lead to network congestion and other issues.  Since the connections
+ * tend to be very long-lived, we can afford to wait a bit before creating the next
+ * even if it adds a bit of build-up time when the crawl starts.
+ */
+public class CrawlerConnectionThrottle {
+    private Instant lastCrawlStart = Instant.EPOCH;
+    private final Semaphore launchSemaphore = new Semaphore(1);
+
+    private final Duration launchInterval;
+
+    public CrawlerConnectionThrottle(Duration launchInterval) {
+        this.launchInterval = launchInterval;
+    }
+
+    public void waitForConnectionPermission() throws InterruptedException {
+        try {
+            launchSemaphore.acquire();
+            Instant nextPermittedLaunch = lastCrawlStart.plus(launchInterval);
+
+            if (nextPermittedLaunch.isAfter(Instant.now())) {
+                long waitTime = Duration.between(Instant.now(), nextPermittedLaunch).toMillis();
+                TimeUnit.MILLISECONDS.sleep(waitTime);
+            }
+
+            lastCrawlStart = Instant.now();
+        }
+        finally {
+            launchSemaphore.release();
+        }
+    }
+}
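As an illustration of the throttle's behaviour (a sketch only, not repository code): connection launches across crawler tasks are serialized by the semaphore and spaced at least launchInterval apart, so workers sharing one instance start roughly 50 ms apart with the interval used below:

var throttle = new CrawlerConnectionThrottle(Duration.ofMillis(50));

Runnable worker = () -> {
    try {
        throttle.waitForConnectionPermission();
        // ... open the TCP/TLS connection and start crawling the domain ...
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
};

for (int i = 0; i < 3; i++) {
    new Thread(worker, "crawler-" + i).start();
}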
@@ -7,7 +7,6 @@ import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.DomainStateDb;
 import nu.marginalia.crawl.fetcher.ContentTags;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
-import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.logic.LinkFilterSelector;
 import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
@@ -29,13 +28,13 @@ import java.nio.file.Path;
 import java.time.Duration;
 import java.time.Instant;
 import java.util.List;
+import java.util.Objects;
 import java.util.Optional;
 import java.util.concurrent.TimeUnit;

 public class CrawlerRetreiver implements AutoCloseable {

     private static final int MAX_ERRORS = 20;
-    private static final int HTTP_429_RETRY_LIMIT = 1; // Retry 429s once

     private final HttpFetcher fetcher;

@@ -53,6 +52,10 @@ public class CrawlerRetreiver implements AutoCloseable {
     private final WarcRecorder warcRecorder;
     private final CrawlerRevisitor crawlerRevisitor;

+    private static final CrawlerConnectionThrottle connectionThrottle = new CrawlerConnectionThrottle(
+            Duration.ofMillis(50) // pace the connections to avoid network congestion at startup
+    );
+
     int errorCount = 0;

     public CrawlerRetreiver(HttpFetcher fetcher,
@@ -92,6 +95,11 @@ public class CrawlerRetreiver implements AutoCloseable {

     public int crawlDomain(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
         try (oldCrawlData) {
+
+            // Wait for permission to open a connection to avoid network congestion
+            // from hundreds/thousands of TCP handshakes
+            connectionThrottle.waitForConnectionPermission();
+
             // Do an initial domain probe to determine the root URL
             var probeResult = probeRootUrl();

@@ -137,6 +145,10 @@ public class CrawlerRetreiver implements AutoCloseable {
                     domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, status.toString(), desc));
                     yield 1;
                 }
+                default -> {
+                    logger.error("Unexpected domain probe result {}", probeResult);
+                    yield 1;
+                }
             };

     }
@@ -160,7 +172,14 @@ public class CrawlerRetreiver implements AutoCloseable {

         // Fetch sitemaps
         for (var sitemap : robotsRules.getSitemaps()) {
-            crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
+
+            // Validate the sitemap URL and check if it belongs to the domain as the root URL
+            if (EdgeUrl.parse(sitemap)
+                    .map(url -> url.getDomain().equals(rootUrl.domain))
+                    .orElse(false)) {
+
+                crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
+            }
         }

         int crawlerAdditions = 0;
@@ -247,17 +266,29 @@ public class CrawlerRetreiver implements AutoCloseable {
         return domainProbeResult;
     }


     private DomainStateDb.SummaryRecord sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
         Optional<String> feedLink = Optional.empty();

         try {
             var url = rootUrl.withPathAndParam("/", null);

-            HttpFetchResult result = fetchWithRetry(url, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
+            HttpFetchResult result = fetcher.fetchContent(url, warcRecorder, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
             timer.waitFetchDelay(0);

-            if (!(result instanceof HttpFetchResult.ResultOk ok))
+            if (result instanceof HttpFetchResult.ResultRedirect(EdgeUrl location)) {
+                if (Objects.equals(location.domain, url.domain)) {
+                    // TODO: Follow the redirect to the new location and sniff the document
+                    crawlFrontier.addFirst(location);
+                }
+
                 return DomainStateDb.SummaryRecord.forSuccess(domain);
+            }
+
+            if (!(result instanceof HttpFetchResult.ResultOk ok)) {
+                return DomainStateDb.SummaryRecord.forSuccess(domain);
+            }

             var optDoc = ok.parseDocument();
             if (optDoc.isEmpty())
@@ -306,7 +337,7 @@ public class CrawlerRetreiver implements AutoCloseable {

         // Grab the favicon if it exists

-        if (fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty()) instanceof HttpFetchResult.ResultOk iconResult) {
+        if (fetcher.fetchContent(faviconUrl, warcRecorder, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED) instanceof HttpFetchResult.ResultOk iconResult) {
             String contentType = iconResult.header("Content-Type");
             byte[] iconData = iconResult.getBodyBytes();

@@ -376,7 +407,7 @@ public class CrawlerRetreiver implements AutoCloseable {
         if (parsedOpt.isEmpty())
             return false;

-        HttpFetchResult result = fetchWithRetry(parsedOpt.get(), timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
+        HttpFetchResult result = fetcher.fetchContent(parsedOpt.get(), warcRecorder, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
         timer.waitFetchDelay(0);

         if (!(result instanceof HttpFetchResult.ResultOk ok)) {
@@ -402,112 +433,63 @@ public class CrawlerRetreiver implements AutoCloseable {
                                               CrawlDelayTimer timer,
                                               DocumentWithReference reference) throws InterruptedException
     {
-        logger.debug("Fetching {}", top);
-
-        long startTime = System.currentTimeMillis();
         var contentTags = reference.getContentTags();

-        HttpFetchResult fetchedDoc = fetchWithRetry(top, timer, HttpFetcher.ProbeType.FULL, contentTags);
+        HttpFetchResult fetchedDoc = fetcher.fetchContent(top, warcRecorder, timer, contentTags, HttpFetcher.ProbeType.FULL);
+        timer.waitFetchDelay();
+
+        if (Thread.interrupted()) {
+            Thread.currentThread().interrupt();
+            throw new InterruptedException();
+        }

         // Parse the document and enqueue links
         try {
-            if (fetchedDoc instanceof HttpFetchResult.ResultOk ok) {
-                var docOpt = ok.parseDocument();
-                if (docOpt.isPresent()) {
-                    var doc = docOpt.get();
-
-                    var responseUrl = new EdgeUrl(ok.uri());
-
-                    crawlFrontier.enqueueLinksFromDocument(responseUrl, doc);
-                    crawlFrontier.addVisited(responseUrl);
-                }
-            }
-            else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {
-                var doc = reference.doc();
-
-                warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBodyBytes, doc.headers, contentTags);
-
-                fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
-                        new ContentType(doc.contentType, "UTF-8"),
-                        doc.documentBodyBytes);
-
-                if (doc.documentBodyBytes != null) {
-                    var parsed = doc.parseBody();
-
-                    crawlFrontier.enqueueLinksFromDocument(top, parsed);
-                    crawlFrontier.addVisited(top);
-                }
-            }
-            else if (fetchedDoc instanceof HttpFetchResult.ResultException) {
-                errorCount ++;
+            switch (fetchedDoc) {
+                case HttpFetchResult.ResultOk ok -> {
+                    var docOpt = ok.parseDocument();
+                    if (docOpt.isPresent()) {
+                        var doc = docOpt.get();
+
+                        var responseUrl = new EdgeUrl(ok.uri());
+
+                        crawlFrontier.enqueueLinksFromDocument(responseUrl, doc);
+                        crawlFrontier.addVisited(responseUrl);
+                    }
+                }
+                case HttpFetchResult.Result304Raw ref when reference.doc() != null ->
+                {
+                    var doc = reference.doc();
+
+                    warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBodyBytes, doc.headers, contentTags);
+
+                    fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
+                            new ContentType(doc.contentType, "UTF-8"),
+                            doc.documentBodyBytes);
+
+                    if (doc.documentBodyBytes != null) {
+                        var parsed = doc.parseBody();
+
+                        crawlFrontier.enqueueLinksFromDocument(top, parsed);
+                        crawlFrontier.addVisited(top);
+                    }
+                }
+                case HttpFetchResult.ResultRedirect(EdgeUrl location) -> {
+                    if (Objects.equals(location.domain, top.domain)) {
+                        crawlFrontier.addFirst(location);
+                    }
+                }
+                case HttpFetchResult.ResultException ex -> errorCount++;
+                default -> {} // Ignore other types
             }
         }
         catch (Exception ex) {
             logger.error("Error parsing document {}", top, ex);
         }

-        timer.waitFetchDelay(System.currentTimeMillis() - startTime);
-
         return fetchedDoc;
     }

-    /** Fetch a document and retry on 429s */
-    private HttpFetchResult fetchWithRetry(EdgeUrl url,
-                                           CrawlDelayTimer timer,
-                                           HttpFetcher.ProbeType probeType,
-                                           ContentTags contentTags) throws InterruptedException {
-
-        long probeStart = System.currentTimeMillis();
-
-        if (probeType == HttpFetcher.ProbeType.FULL) {
-            retryLoop:
-            for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
-                try {
-                    var probeResult = fetcher.probeContentType(url, warcRecorder, contentTags);
-
-                    switch (probeResult) {
-                        case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl):
-                            url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching
-                            break retryLoop;
-                        case HttpFetcher.ContentTypeProbeResult.BadContentType badContentType:
-                            return new HttpFetchResult.ResultNone();
-                        case HttpFetcher.ContentTypeProbeResult.BadContentType.Timeout timeout:
-                            return new HttpFetchResult.ResultException(timeout.ex());
-                        case HttpFetcher.ContentTypeProbeResult.Exception exception:
-                            return new HttpFetchResult.ResultException(exception.ex());
-                        default: // should be unreachable
-                            throw new IllegalStateException("Unknown probe result");
-                    }
-                }
-                catch (HttpFetcherImpl.RateLimitException ex) {
-                    timer.waitRetryDelay(ex);
-                }
-                catch (Exception ex) {
-                    logger.warn("Failed to fetch {}", url, ex);
-                    return new HttpFetchResult.ResultException(ex);
-                }
-            }
-
-            timer.waitFetchDelay(System.currentTimeMillis() - probeStart);
-        }
-
-        for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
-            try {
-                return fetcher.fetchContent(url, warcRecorder, contentTags, probeType);
-            }
-            catch (HttpFetcherImpl.RateLimitException ex) {
-                timer.waitRetryDelay(ex);
-            }
-            catch (Exception ex) {
-                logger.warn("Failed to fetch {}", url, ex);
-                return new HttpFetchResult.ResultException(ex);
-            }
-        }
-
-        return new HttpFetchResult.ResultNone();
-    }
-
     private boolean isAllowedProtocol(String proto) {
         return proto.equalsIgnoreCase("http")
                 || proto.equalsIgnoreCase("https");
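A small sketch of the new sitemap validation with hypothetical URLs: only sitemap links that resolve to the same domain as the crawl root are queued, so a sitemap referenced on a third-party host is now ignored:

EdgeUrl rootUrl = EdgeUrl.parse("https://www.example.com/").orElseThrow();

// "https://www.example.com/sitemap.xml"    -> accepted and queued
// "https://cdn.other-host.com/sitemap.xml" -> rejected
boolean accepted = EdgeUrl.parse("https://www.example.com/sitemap.xml")
        .map(url -> url.getDomain().equals(rootUrl.domain))
        .orElse(false);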
@@ -55,6 +55,9 @@ public class DomainCrawlFrontier {
         }
     }

+    public EdgeDomain getDomain() {
+        return thisDomain;
+    }
     /** Increase the depth of the crawl by a factor.  If the current depth is smaller
      * than the number of already visited documents, the base depth will be adjusted
      * to the visited count first.
@@ -10,6 +10,8 @@ import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
 import nu.marginalia.model.crawldata.CrawledDocument;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

 import java.io.IOException;

@@ -18,10 +20,13 @@ import java.io.IOException;
  * E-Tag and Last-Modified headers.
  */
 public class CrawlerRevisitor {
+
     private final DomainCrawlFrontier crawlFrontier;
     private final CrawlerRetreiver crawlerRetreiver;
     private final WarcRecorder warcRecorder;

+    private static final Logger logger = LoggerFactory.getLogger(CrawlerRevisitor.class);
+
     public CrawlerRevisitor(DomainCrawlFrontier crawlFrontier,
                             CrawlerRetreiver crawlerRetreiver,
                             WarcRecorder warcRecorder) {
@@ -151,11 +156,13 @@ public class CrawlerRevisitor {
             else if (result instanceof HttpFetchResult.ResultException) {
                 errors++;
             }

             recrawled++;
         }
         }

+        logger.info("Recrawl summary {}: {} recrawled, {} retained, {} errors, {} skipped",
+                crawlFrontier.getDomain(), recrawled, retained, errors, skipped);
+
         return new RecrawlMetadata(size, errors, skipped);
     }

@@ -6,6 +6,7 @@ import nu.marginalia.model.body.HttpFetchResult;
 import nu.marginalia.model.crawldata.CrawledDocument;

 import javax.annotation.Nullable;
+import java.util.Objects;

 public record DocumentWithReference(
         @Nullable CrawledDocument doc,
@@ -33,8 +34,22 @@ public record DocumentWithReference(
             return false;
         if (doc == null)
             return false;
-        if (doc.documentBodyBytes.length == 0)
-            return false;
+        if (doc.documentBodyBytes.length == 0) {
+            if (doc.httpStatus < 300) {
+                return resultOk.bytesLength() == 0;
+            }
+            else if (doc.httpStatus == 301 || doc.httpStatus == 302 || doc.httpStatus == 307) {
+                @Nullable
+                String docLocation = doc.getHeader("Location");
+                @Nullable
+                String resultLocation = resultOk.header("Location");
+
+                return Objects.equals(docLocation, resultLocation);
+            }
+            else {
+                return doc.httpStatus == resultOk.statusCode();
+            }
+        }

         return CrawlDataReference.isContentBodySame(doc.documentBodyBytes, resultOk.bytesRaw());
     }
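Reading the new branch, the equivalence rules for a previously stored document with an empty body work out roughly as follows; this is an interpretation of the code above with invented example values, not code from the repository:

// stored 200, empty body   vs fresh 200 with an empty body        -> same
// stored 301 -> /new-path  vs fresh 301 with Location: /new-path  -> same
// stored 301 -> /new-path  vs fresh 301 with Location: /other     -> different
// stored 404, empty body   vs fresh 404                           -> same (status unchanged)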
@@ -41,6 +41,8 @@ dependencies {
     implementation libs.snakeyaml
     implementation libs.zstd

+    implementation libs.bundles.httpcomponents
+
     testImplementation libs.bundles.slf4j.test
     testImplementation libs.bundles.junit
     testImplementation libs.mockito
@@ -1,6 +1,9 @@
 package nu.marginalia.model.body;

 import nu.marginalia.contenttype.ContentType;
+import nu.marginalia.model.EdgeUrl;
+import org.apache.hc.core5.http.Header;
+import org.apache.hc.core5.http.message.BasicHeader;
 import org.jetbrains.annotations.Nullable;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
@@ -11,8 +14,10 @@ import java.io.ByteArrayInputStream;
 import java.io.InputStream;
 import java.net.InetAddress;
 import java.net.URI;
-import java.net.http.HttpHeaders;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Optional;

 /* FIXME: This interface has a very unfortunate name that is not very descriptive.
 */
@@ -56,7 +61,7 @@ public sealed interface HttpFetchResult {
      */
     record ResultOk(URI uri,
                     int statusCode,
-                    HttpHeaders headers,
+                    Header[] headers,
                     String ipAddress,
                     byte[] bytesRaw, // raw data for the entire response including headers
                     int bytesStart,
@@ -67,18 +72,19 @@ public sealed interface HttpFetchResult {
             this(uri, status, convertHeaders(headers), ipAddress, bytes, bytesStart, length);
         }

-        private static HttpHeaders convertHeaders(MessageHeaders messageHeaders) {
-            Map<String, List<String>> inputMap = messageHeaders.map();
-            Map<String, List<String>> filteredMap = new HashMap<>(Math.max(4, inputMap.size()));
-
-            inputMap.forEach((k, v) -> {
+        private static Header[] convertHeaders(MessageHeaders messageHeaders) {
+            List<Header> headers = new ArrayList<>(12);
+
+            messageHeaders.map().forEach((k, v) -> {
                 if (k.isBlank()) return;
                 if (!Character.isAlphabetic(k.charAt(0))) return;

-                filteredMap.put(k, v);
+                for (var value : v) {
+                    headers.add(new BasicHeader(k, value));
+                }
             });

-            return HttpHeaders.of(filteredMap, (k,v) -> true);
+            return headers.toArray(new Header[0]);
         }

         public boolean isOk() {
@@ -108,7 +114,13 @@ public sealed interface HttpFetchResult {

         @Nullable
         public String header(String name) {
-            return headers.firstValue(name).orElse(null);
+            for (var header : headers) {
+                if (header.getName().equalsIgnoreCase(name)) {
+                    String headerValue = header.getValue();
+                    return headerValue;
+                }
+            }
+            return null;
         }

     }
@@ -132,6 +144,12 @@ public sealed interface HttpFetchResult {
         }
     }

+    record ResultRedirect(EdgeUrl url) implements HttpFetchResult {
+        public boolean isOk() {
+            return true;
+        }
+    }
+
     /** Fetching resulted in a HTTP 304, the remote content is identical to
      * our reference copy. This will be replaced with a Result304ReplacedWithReference
      * at a later stage.
@@ -102,7 +102,7 @@ public final class CrawledDocument implements SerializableCrawlData {
     }

     @Nullable
-    private String getHeader(String header) {
+    public String getHeader(String header) {
         if (headers == null) {
             return null;
         }
@@ -165,12 +165,26 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
             contentType = "";
         }

+        boolean hasCookies = false;
+        String etag = null;
+        String lastModified = null;
+
         StringJoiner headersStrBuilder = new StringJoiner("\n");
-        for (var header : headers.map().entrySet()) {
-            for (var value : header.getValue()) {
-                headersStrBuilder.add(header.getKey() + ": " + value);
+        for (var header : headers) {
+            if (header.getName().equalsIgnoreCase("X-Has-Cookies")) {
+                hasCookies = hasCookies || header.getValue().equals("1");
+            }
+            else if (header.getName().equalsIgnoreCase("ETag")) {
+                etag = header.getValue();
+            }
+            else if (header.getName().equalsIgnoreCase("Last-Modified")) {
+                lastModified = header.getValue();
+            }
+            else {
+                headersStrBuilder.add(header.getName() + ": " + header.getValue());
             }
         }

         String headersStr = headersStrBuilder.toString();

@@ -178,14 +192,14 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
                 domain,
                 response.target(),
                 fetchOk.ipAddress(),
-                headers.firstValue("X-Has-Cookies").orElse("0").equals("1"),
+                hasCookies,
                 fetchOk.statusCode(),
                 response.date(),
                 contentType,
                 bodyBytes,
                 headersStr,
-                headers.firstValue("ETag").orElse(null),
-                headers.firstValue("Last-Modified").orElse(null)
+                etag,
+                lastModified
         ));
     }
@@ -341,12 +341,15 @@ public record SlopCrawlDataRecord(String domain,
             contentType = "";
         }

+        boolean hasCookies = false;
+
         String headersStr;
         StringJoiner headersStrBuilder = new StringJoiner("\n");
-        for (var header : headers.map().entrySet()) {
-            for (var value : header.getValue()) {
-                headersStrBuilder.add(header.getKey() + ": " + value);
+        for (var header : headers) {
+            if (header.getName().equalsIgnoreCase("X-Cookies") && "1".equals(header.getValue())) {
+                hasCookies = true;
             }
+            headersStrBuilder.add(header.getName() + ": " + header.getValue());
         }
         headersStr = headersStrBuilder.toString();

@@ -355,7 +358,7 @@ public record SlopCrawlDataRecord(String domain,
                 domain,
                 response.target(),
                 fetchOk.ipAddress(),
-                "1".equals(headers.firstValue("X-Cookies").orElse("0")),
+                hasCookies,
                 fetchOk.statusCode(),
                 response.date().toEpochMilli(),
                 contentType,
@@ -0,0 +1,140 @@
+package nu.marginalia.crawl.fetcher;
+
+import com.github.tomakehurst.wiremock.WireMockServer;
+import com.github.tomakehurst.wiremock.client.WireMock;
+import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
+import nu.marginalia.UserAgent;
+import nu.marginalia.crawl.retreival.CrawlDelayTimer;
+import nu.marginalia.model.EdgeUrl;
+import org.junit.jupiter.api.*;
+
+import java.io.IOException;
+import java.net.URISyntaxException;
+
+@Tag("slow")
+class HttpFetcherImplContentTypeProbeTest {
+
+    private HttpFetcherImpl fetcher;
+    private static WireMockServer wireMockServer;
+
+    private static EdgeUrl timeoutUrl;
+    private static EdgeUrl contentTypeHtmlUrl;
+    private static EdgeUrl contentTypeBinaryUrl;
+    private static EdgeUrl redirectUrl;
+    private static EdgeUrl badHttpStatusUrl;
+    private static EdgeUrl onlyGetAllowedUrl;
+
+    @BeforeAll
+    public static void setupAll() throws URISyntaxException {
+        wireMockServer =
+                new WireMockServer(WireMockConfiguration.wireMockConfig()
+                        .port(18089));
+
+        timeoutUrl = new EdgeUrl("http://localhost:18089/timeout.bin");
+
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(timeoutUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withFixedDelay(15000))); // 15 seconds delay to simulate timeout
+
+        contentTypeHtmlUrl = new EdgeUrl("http://localhost:18089/test.html.bin");
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(contentTypeHtmlUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withStatus(200)));
+
+        contentTypeBinaryUrl = new EdgeUrl("http://localhost:18089/test.bad.bin");
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(contentTypeBinaryUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "application/octet-stream")
+                        .withStatus(200)));
+
+        redirectUrl = new EdgeUrl("http://localhost:18089/redirect.bin");
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(redirectUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Location", "http://localhost:18089/test.html.bin")
+                        .withStatus(301)));
+
+        badHttpStatusUrl = new EdgeUrl("http://localhost:18089/badstatus.bin");
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(badHttpStatusUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withStatus(500)));
+
+        onlyGetAllowedUrl = new EdgeUrl("http://localhost:18089/onlyget.bin");
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(onlyGetAllowedUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withStatus(405))); // Method Not Allowed
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(onlyGetAllowedUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withStatus(200)));
+
+        wireMockServer.start();
+
+    }
+
+    @AfterAll
+    public static void tearDownAll() {
+        wireMockServer.stop();
+    }
+
+    @BeforeEach
+    public void setUp() {
+        fetcher = new HttpFetcherImpl(new UserAgent("test.marginalia.nu", "test.marginalia.nu"));
+    }
+
+    @AfterEach
+    public void tearDown() throws IOException {
+        fetcher.close();
+    }
+
+    @Test
+    public void testProbeContentTypeHtmlShortcircuitPath() throws URISyntaxException {
+        var result = fetcher.probeContentType(new EdgeUrl("https://localhost/test.html"), new CrawlDelayTimer(50), ContentTags.empty());
+        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Ok.class, result);
+    }
+
+    @Test
+    public void testProbeContentTypeHtmlShortcircuitTags() {
+        var result = fetcher.probeContentType(contentTypeBinaryUrl, new CrawlDelayTimer(50), new ContentTags("a", "b"));
+        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Ok.class, result);
+    }
+
+    @Test
+    public void testProbeContentTypeHtml() {
+        var result = fetcher.probeContentType(contentTypeHtmlUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Ok(contentTypeHtmlUrl), result);
+    }
+
+    @Test
+    public void testProbeContentTypeBinary() {
+        var result = fetcher.probeContentType(contentTypeBinaryUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.BadContentType("application/octet-stream", 200), result);
+    }
+
+    @Test
+    public void testProbeContentTypeRedirect() {
+        var result = fetcher.probeContentType(redirectUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Redirect(contentTypeHtmlUrl), result);
+    }
+
+    @Test
+    public void testProbeContentTypeBadHttpStatus() {
+        var result = fetcher.probeContentType(badHttpStatusUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.HttpError(500, "Bad status code"), result);
+    }
+
+    @Test
+    public void testOnlyGetAllowed() {
+        var result = fetcher.probeContentType(onlyGetAllowedUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Ok(onlyGetAllowedUrl), result);
+    }
+
+    @Test
+    public void testTimeout() {
+        var result = fetcher.probeContentType(timeoutUrl, new CrawlDelayTimer(50), ContentTags.empty());
+        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Timeout.class, result);
+    }
+
+}
@@ -0,0 +1,89 @@
+package nu.marginalia.crawl.fetcher;
+
+import com.github.tomakehurst.wiremock.WireMockServer;
+import com.github.tomakehurst.wiremock.client.WireMock;
+import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
+import nu.marginalia.UserAgent;
+import nu.marginalia.model.EdgeDomain;
+import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.crawldata.CrawlerDomainStatus;
+import org.junit.jupiter.api.*;
+
+import java.io.IOException;
+import java.net.URISyntaxException;
+
+@Tag("slow")
+class HttpFetcherImplDomainProbeTest {
+
+    private HttpFetcherImpl fetcher;
+    private static WireMockServer wireMockServer;
+
+    private static EdgeUrl timeoutUrl;
+
+    @BeforeAll
+    public static void setupAll() throws URISyntaxException {
+        wireMockServer =
+                new WireMockServer(WireMockConfiguration.wireMockConfig()
+                        .port(18089));
+
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo("/timeout"))
+                .willReturn(WireMock.aResponse()
+                        .withFixedDelay(15000))); // 15 seconds delay to simulate timeout
+
+        wireMockServer.start();
+        timeoutUrl = new EdgeUrl("http://localhost:18089/timeout");
+    }
+
+    @AfterAll
+    public static void tearDownAll() {
+        wireMockServer.stop();
+    }
+
+    @BeforeEach
+    public void setUp() {
+        fetcher = new HttpFetcherImpl(new UserAgent("test.marginalia.nu", "test.marginalia.nu"));
+    }
+
+    @AfterEach
+    public void tearDown() throws IOException {
+        fetcher.close();
+    }
+
+    @Test
+    public void testProbeDomain() throws URISyntaxException {
+        var result = fetcher.probeDomain(new EdgeUrl("https://www.marginalia.nu/"));
+        Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Ok(new EdgeUrl("https://www.marginalia.nu/")), result);
+    }
+
+    @Test
+    public void testProbeDomainProtoUpgrade() throws URISyntaxException {
+        var result = fetcher.probeDomain(new EdgeUrl("http://www.marginalia.nu/"));
+        Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Ok(new EdgeUrl("https://www.marginalia.nu/")), result);
+    }
+
+    @Test
+    public void testProbeDomainRedirect() throws URISyntaxException {
+        var result = fetcher.probeDomain(new EdgeUrl("http://search.marginalia.nu/"));
+        Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Redirect(new EdgeDomain("marginalia-search.com")), result);
+    }
+
+    @Test
+    public void testProbeDomainOnlyGET() throws URISyntaxException {
+        // This test is to check if the domain probe only allows GET requests
+        var result = fetcher.probeDomain(new EdgeUrl("https://marginalia-search.com/"));
+        Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Ok(new EdgeUrl("https://marginalia-search.com/")), result);
+    }
+
+    @Test
+    public void testProbeDomainError() throws URISyntaxException {
+        var result = fetcher.probeDomain(new EdgeUrl("https://invalid.example.com/"));
+        Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Error during domain probe"), result);
+    }
+
+    @Test
+    public void testProbeDomainTimeout() throws URISyntaxException {
+        var result = fetcher.probeDomain(timeoutUrl);
+        Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Timeout during domain probe"), result);
+    }
+}
@@ -0,0 +1,330 @@
+package nu.marginalia.crawl.fetcher;
+
+import com.github.tomakehurst.wiremock.WireMockServer;
+import com.github.tomakehurst.wiremock.client.WireMock;
+import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
+import nu.marginalia.UserAgent;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.retreival.CrawlDelayTimer;
+import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.body.HttpFetchResult;
+import org.junit.jupiter.api.*;
+import org.netpreserve.jwarc.*;
+
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.time.Instant;
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+@Tag("slow")
+class HttpFetcherImplFetchTest {
+
+    private HttpFetcherImpl fetcher;
+    private static WireMockServer wireMockServer;
+
+    private static String etag = "etag";
+    private static String lastModified = "Wed, 21 Oct 2024 07:28:00 GMT";
+
+    private static EdgeUrl okUrl;
+    private static EdgeUrl okRangeResponseUrl;
+    private static EdgeUrl okUrlWith304;
+
+    private static EdgeUrl timeoutUrl;
+    private static EdgeUrl redirectUrl;
+    private static EdgeUrl badHttpStatusUrl;
+    private static EdgeUrl keepAliveUrl;
+
+    @BeforeAll
+    public static void setupAll() throws URISyntaxException {
+        wireMockServer =
+                new WireMockServer(WireMockConfiguration.wireMockConfig()
+                        .port(18089));
+
+        timeoutUrl = new EdgeUrl("http://localhost:18089/timeout.bin");
+
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(timeoutUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withFixedDelay(15000)
+                )); // 15 seconds delay to simulate timeout
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(timeoutUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withFixedDelay(15000)
+                        .withBody("Hello World")
+                )); // 15 seconds delay to simulate timeout
+
+        redirectUrl = new EdgeUrl("http://localhost:18089/redirect.bin");
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(redirectUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Location", "http://localhost:18089/test.html.bin")
+                        .withStatus(301)));
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(redirectUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Location", "http://localhost:18089/test.html.bin")
+                        .withStatus(301)));
+
+        badHttpStatusUrl = new EdgeUrl("http://localhost:18089/badstatus");
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(badHttpStatusUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withStatus(500)));
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(badHttpStatusUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withStatus(500)));
+
+        okUrl = new EdgeUrl("http://localhost:18089/ok.bin");
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(okUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withStatus(200)));
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(okUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withStatus(200)
+                        .withBody("Hello World")));
+
+        okUrlWith304 = new EdgeUrl("http://localhost:18089/ok304.bin");
+        wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(okUrlWith304.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withHeader("ETag", etag)
+                        .withHeader("Last-Modified", lastModified)
+                        .withStatus(304)));
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(okUrlWith304.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withHeader("ETag", etag)
+                        .withHeader("Last-Modified", lastModified)
+                        .withStatus(304)));
+
+        okRangeResponseUrl = new EdgeUrl("http://localhost:18089/okRangeResponse.bin");
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(okRangeResponseUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Range", "bytes 0-100/200")
+                        .withBody("Hello World")
+                        .withStatus(206)));
+
+        keepAliveUrl = new EdgeUrl("http://localhost:18089/keepalive.bin");
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(keepAliveUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "text/html")
+                        .withStatus(200)
+                        .withHeader("Keep-Alive", "max=4, timeout=30")
+                        .withBody("Hello")
+                ));
+        wireMockServer.start();
+
+    }
+
+    @AfterAll
+    public static void tearDownAll() {
+        wireMockServer.stop();
+    }
+
+    WarcRecorder warcRecorder;
+    Path warcFile;
+
+    @BeforeEach
+    public void setUp() throws IOException {
+        fetcher = new HttpFetcherImpl(new UserAgent("test.marginalia.nu", "test.marginalia.nu"));
+        warcFile = Files.createTempFile(getClass().getSimpleName(), ".warc");
+        warcRecorder = new WarcRecorder(warcFile, fetcher);
+    }
+
+    @AfterEach
+    public void tearDown() throws IOException {
+        fetcher.close();
+        warcRecorder.close();
+        Files.deleteIfExists(warcFile);
+    }
+
+    @Test
+    public void testOk_NoProbe() throws IOException {
+        var result = fetcher.fetchContent(okUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
+        Assertions.assertTrue(result.isOk());
+
+        List<WarcRecord> warcRecords = getWarcRecords();
+        assertEquals(2, warcRecords.size());
+        Assertions.assertInstanceOf(WarcRequest.class, warcRecords.get(0));
+        Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));
+
+        WarcResponse response = (WarcResponse) warcRecords.get(1);
+        assertEquals("0", response.headers().first("X-Has-Cookies").orElse("0"));
+    }
+
+    @Test
+    public void testOk_FullProbe() {
+        var result = fetcher.fetchContent(okUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
+        Assertions.assertTrue(result.isOk());
+    }
+
+    @Test
+    public void testOk304_NoProbe() {
+        var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.DISABLED);
+
+        Assertions.assertInstanceOf(HttpFetchResult.Result304Raw.class, result);
+        System.out.println(result);
+    }
+
+    @Test
+    public void testOk304_FullProbe() {
+        var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.FULL);
+
+        Assertions.assertInstanceOf(HttpFetchResult.Result304Raw.class, result);
+        System.out.println(result);
+    }
+
+    @Test
+    public void testBadStatus_NoProbe() throws IOException {
+        var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
+        Assertions.assertFalse(result.isOk());
+
+        List<WarcRecord> warcRecords = getWarcRecords();
+        assertEquals(2, warcRecords.size());
+        Assertions.assertInstanceOf(WarcRequest.class, warcRecords.get(0));
+        Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));
+    }
+
+    @Test
+    public void testBadStatus_FullProbe() {
+        var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
+        Assertions.assertFalse(result.isOk());
+
+        System.out.println(result);
+    }
+
+    @Test
+    public void testRedirect_NoProbe() throws URISyntaxException, IOException {
+        var result = fetcher.fetchContent(redirectUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultRedirect.class, result);
+        assertEquals(new EdgeUrl("http://localhost:18089/test.html.bin"), ((HttpFetchResult.ResultRedirect) result).url());
+
+        List<WarcRecord> warcRecords = getWarcRecords();
+        assertEquals(2, warcRecords.size());
+        Assertions.assertInstanceOf(WarcRequest.class, warcRecords.get(0));
+        Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));
+    }
+
+    @Test
+    public void testRedirect_FullProbe() throws URISyntaxException {
+        var result = fetcher.fetchContent(redirectUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultRedirect.class, result);
+        assertEquals(new EdgeUrl("http://localhost:18089/test.html.bin"), ((HttpFetchResult.ResultRedirect) result).url());
+
+        System.out.println(result);
+    }
+
+    @Test
+    public void testFetchTimeout_NoProbe() throws IOException, URISyntaxException {
+        Instant requestStart = Instant.now();
+
+        var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultException.class, result);
+
+        Instant requestEnd = Instant.now();
+
+        System.out.println(result);
+
+        // Verify that we are actually timing out, and not blocking on the request until it finishes (which would be a bug),
+        // the request will take 15 seconds to complete, so we should be able to timeout before that, something like 10 seconds and change;
+        // but we'll verify that it is less than 15 seconds to make the test less fragile.
+
+        Assertions.assertTrue(requestEnd.isBefore(requestStart.plusSeconds(15)), "Request should have taken less than 15 seconds");
+
+        var records = getWarcRecords();
+        Assertions.assertEquals(1, records.size());
+        Assertions.assertInstanceOf(WarcXEntityRefused.class, records.getFirst());
+        WarcXEntityRefused entity = (WarcXEntityRefused) records.getFirst();
+        assertEquals(WarcXEntityRefused.documentProbeTimeout, entity.profile());
+        assertEquals(timeoutUrl.asURI(), entity.targetURI());
+    }
+
+    @Test
+    public void testRangeResponse() throws IOException {
+        var result = fetcher.fetchContent(okRangeResponseUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
+        Assertions.assertTrue(result.isOk());
+
+        List<WarcRecord> warcRecords = getWarcRecords();
+        assertEquals(2, warcRecords.size());
+        Assertions.assertInstanceOf(WarcRequest.class, warcRecords.get(0));
+        Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));
+
+        var response = (WarcResponse) warcRecords.get(1);
+        assertEquals("length", response.headers().first("WARC-Truncated").orElse(""));
+    }
+
+    @Test
+    public void testFetchTimeout_Probe() throws IOException, URISyntaxException {
+        Instant requestStart = Instant.now();
+        var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+        Instant requestEnd = Instant.now();
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultException.class, result);
+
+        // Verify that we are actually timing out, and not blocking on the request until it finishes (which would be a bug),
+        // the request will take 15 seconds to complete, so we should be able to timeout before that, something like 10 seconds and change;
+        // but we'll verify that it is less than 15 seconds to make the test less fragile.
+
+        Assertions.assertTrue(requestEnd.isBefore(requestStart.plusSeconds(15)), "Request should have taken less than 15 seconds");
+
+        var records = getWarcRecords();
+        Assertions.assertEquals(1, records.size());
+        Assertions.assertInstanceOf(WarcXEntityRefused.class, records.getFirst());
+        WarcXEntityRefused entity = (WarcXEntityRefused) records.getFirst();
+        assertEquals(WarcXEntityRefused.documentProbeTimeout, entity.profile());
+        assertEquals(timeoutUrl.asURI(), entity.targetURI());
+    }
+
+    @Test
+    public void testKeepaliveUrl() {
+        // mostly for smoke testing and debugger utility
+        var result = fetcher.fetchContent(keepAliveUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
+        Assertions.assertTrue(result.isOk());
+    }
+
+    private List<WarcRecord> getWarcRecords() throws IOException {
+        List<WarcRecord> records = new ArrayList<>();
+
+        System.out.println(Files.readString(warcFile));
+
+        try (var reader = new WarcReader(warcFile)) {
+            WarcXResponseReference.register(reader);
+            WarcXEntityRefused.register(reader);
+
+            for (var record : reader) {
+                records.add(record);
+            }
+        }
+
+        return records;
+    }
+
+}
@@ -1,9 +1,12 @@
 package nu.marginalia.crawl.retreival;

-import nu.marginalia.crawl.fetcher.Cookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import org.apache.hc.client5.http.classic.HttpClient;
+import org.apache.hc.client5.http.cookie.BasicCookieStore;
+import org.apache.hc.client5.http.impl.classic.HttpClients;
+import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -13,8 +16,6 @@ import org.netpreserve.jwarc.WarcResponse;

 import java.io.IOException;
 import java.net.URISyntaxException;
-import java.net.http.HttpClient;
-import java.net.http.HttpRequest;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.security.NoSuchAlgorithmException;
@@ -30,8 +31,7 @@ class CrawlerWarcResynchronizerTest {
     HttpClient httpClient;
     @BeforeEach
     public void setUp() throws Exception {
-        httpClient = HttpClient.newBuilder()
-                .build();
+        httpClient = HttpClients.createDefault();

         fileName = Files.createTempFile("test", ".warc.gz");
         outputFile = Files.createTempFile("test", ".warc.gz");
@@ -45,7 +45,7 @@ class CrawlerWarcResynchronizerTest {

     @Test
     void run() throws IOException, URISyntaxException {
-        try (var oldRecorder = new WarcRecorder(fileName, new Cookies())) {
+        try (var oldRecorder = new WarcRecorder(fileName, new BasicCookieStore())) {
             fetchUrl(oldRecorder, "https://www.marginalia.nu/");
             fetchUrl(oldRecorder, "https://www.marginalia.nu/log/");
             fetchUrl(oldRecorder, "https://www.marginalia.nu/feed/");
@@ -55,7 +55,7 @@ class CrawlerWarcResynchronizerTest {

         var crawlFrontier = new DomainCrawlFrontier(new EdgeDomain("www.marginalia.nu"), List.of(), 100);

-        try (var newRecorder = new WarcRecorder(outputFile, new Cookies())) {
+        try (var newRecorder = new WarcRecorder(outputFile, new BasicCookieStore())) {
             new CrawlerWarcResynchronizer(crawlFrontier, newRecorder).run(fileName);
         }

@@ -78,11 +78,10 @@ class CrawlerWarcResynchronizerTest {
     }

     void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        var req = HttpRequest.newBuilder()
-                .uri(new java.net.URI(url))
-                .header("User-agent", "test.marginalia.nu")
-                .header("Accept-Encoding", "gzip")
-                .GET().build();
+        var req = ClassicRequestBuilder.get(new java.net.URI(url))
+                .addHeader("User-agent", "test.marginalia.nu")
+                .addHeader("Accept-Encoding", "gzip")
+                .build();
         recorder.fetch(httpClient, req);
     }
 }
@@ -2,10 +2,9 @@ package nu.marginalia.crawl.retreival.fetcher;

 import com.sun.net.httpserver.HttpServer;
 import nu.marginalia.crawl.fetcher.ContentTags;
-import nu.marginalia.crawl.fetcher.Cookies;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
-import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.retreival.CrawlDelayTimer;
 import nu.marginalia.model.EdgeUrl;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
@@ -32,7 +31,6 @@ class ContentTypeProberTest {
     static EdgeUrl timeoutEndpoint;

     static Path warcFile;
-    static WarcRecorder recorder;

     @BeforeEach
     void setUp() throws IOException {
@@ -80,21 +78,17 @@ class ContentTypeProberTest {
         htmlRedirEndpoint = EdgeUrl.parse("http://localhost:" + port + "/redir.gz").get();

         fetcher = new HttpFetcherImpl("test");
-        recorder = new WarcRecorder(warcFile, new Cookies());
     }

     @AfterEach
     void tearDown() throws IOException {
         server.stop(0);
         fetcher.close();
-        recorder.close();
-
-        Files.deleteIfExists(warcFile);
     }

     @Test
     void probeContentTypeOk() throws Exception {
-        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlEndpoint, recorder, ContentTags.empty());
+        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlEndpoint, new CrawlDelayTimer(50), ContentTags.empty());

         System.out.println(result);

@@ -103,16 +97,16 @@ class ContentTypeProberTest {

     @Test
     void probeContentTypeRedir() throws Exception {
-        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlRedirEndpoint, recorder, ContentTags.empty());
+        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlRedirEndpoint, new CrawlDelayTimer(50), ContentTags.empty());

         System.out.println(result);

-        assertEquals(result, new HttpFetcher.ContentTypeProbeResult.Ok(htmlEndpoint));
+        assertEquals(result, new HttpFetcher.ContentTypeProbeResult.Redirect(htmlEndpoint));
     }

     @Test
     void probeContentTypeBad() throws Exception {
-        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(binaryEndpoint, recorder, ContentTags.empty());
+        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(binaryEndpoint, new CrawlDelayTimer(50), ContentTags.empty());

         System.out.println(result);

@@ -121,7 +115,7 @@ class ContentTypeProberTest {

     @Test
     void probeContentTypeTimeout() throws Exception {
-        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(timeoutEndpoint, recorder, ContentTags.empty());
+        HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(timeoutEndpoint, new CrawlDelayTimer(50), ContentTags.empty());

         System.out.println(result);
@@ -0,0 +1,160 @@
+package nu.marginalia.crawl.retreival.fetcher;
+
+import com.sun.net.httpserver.HttpServer;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
+import org.apache.hc.client5.http.classic.HttpClient;
+import org.apache.hc.client5.http.cookie.BasicCookieStore;
+import org.apache.hc.client5.http.impl.classic.HttpClients;
+import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
+import org.junit.jupiter.api.*;
+import org.netpreserve.jwarc.WarcReader;
+import org.netpreserve.jwarc.WarcRequest;
+import org.netpreserve.jwarc.WarcResponse;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.time.Duration;
+import java.time.Instant;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+
+@Tag("slow")
+class WarcRecorderFakeServerTest {
+    static HttpServer server;
+
+    @BeforeAll
+    public static void setUpAll() throws IOException {
+        server = HttpServer.create(new InetSocketAddress("127.0.0.1", 14510), 10);
+
+        // This endpoint will finish sending the response immediately
+        server.createContext("/fast", exchange -> {
+            exchange.getResponseHeaders().add("Content-Type", "text/html");
+            exchange.sendResponseHeaders(200, "<html><body>hello</body></html>".length());
+
+            try (var os = exchange.getResponseBody()) {
+                os.write("<html><body>hello</body></html>".getBytes());
+                os.flush();
+            }
+            exchange.close();
+        });
+
+        // This endpoint will take 10 seconds to finish sending the response,
+        // which should trigger a timeout in the client
+        server.createContext("/slow", exchange -> {
+            exchange.getResponseHeaders().add("Content-Type", "text/html");
+            exchange.sendResponseHeaders(200, "<html><body>hello</body></html>:D".length());
+
+            try (var os = exchange.getResponseBody()) {
+                os.write("<html><body>hello</body></html>".getBytes());
+                os.flush();
+                try {
+                    TimeUnit.SECONDS.sleep(1);
+                } catch (InterruptedException e) {
+                    throw new RuntimeException(e);
+                }
+                os.write(":".getBytes());
+                os.flush();
+                try {
+                    TimeUnit.SECONDS.sleep(1);
+                } catch (InterruptedException e) {
+                    throw new RuntimeException(e);
+                }
+
+                os.write("D".getBytes());
+                os.flush();
+            }
+            exchange.close();
+        });
+
+        server.start();
+    }
+
+    @AfterAll
+    public static void tearDownAll() {
+        server.stop(0);
+    }
+
+    Path fileNameWarc;
+    Path fileNameParquet;
+    WarcRecorder client;
+
+    HttpClient httpClient;
+    @BeforeEach
+    public void setUp() throws Exception {
+        httpClient = HttpClients.createDefault();
+
+        fileNameWarc = Files.createTempFile("test", ".warc");
+        fileNameParquet = Files.createTempFile("test", ".parquet");
+
+        client = new WarcRecorder(fileNameWarc, new BasicCookieStore());
+    }
+
+    @AfterEach
+    public void tearDown() throws Exception {
+        client.close();
+        Files.delete(fileNameWarc);
+    }
+
+    @Test
+    public void fetchFast() throws Exception {
+        client.fetch(httpClient,
+                ClassicRequestBuilder
+                        .get(new java.net.URI("http://localhost:14510/fast"))
+                        .addHeader("User-agent", "test.marginalia.nu")
+                        .addHeader("Accept-Encoding", "gzip")
+                        .build()
+        );
+
+        Map<String, String> sampleData = new HashMap<>();
+        try (var warcReader = new WarcReader(fileNameWarc)) {
+            warcReader.forEach(record -> {
+                if (record instanceof WarcRequest req) {
+                    sampleData.put(record.type(), req.target());
+                }
+                if (record instanceof WarcResponse rsp) {
+                    sampleData.put(record.type(), rsp.target());
+                }
+            });
+        }
+
+        System.out.println(sampleData);
+    }
+
+    @Test
+    public void fetchSlow() throws Exception {
+        Instant start = Instant.now();
+
+        client.fetch(httpClient,
+                ClassicRequestBuilder.get(new java.net.URI("http://localhost:14510/slow"))
+                        .addHeader("User-agent", "test.marginalia.nu")
+                        .addHeader("Accept-Encoding", "gzip")
+                        .build(),
+                Duration.ofSeconds(1)
+        );
+        Instant end = Instant.now();
+
+        Map<String, String> sampleData = new HashMap<>();
+        try (var warcReader = new WarcReader(fileNameWarc)) {
+            warcReader.forEach(record -> {
+                if (record instanceof WarcRequest req) {
+                    sampleData.put(record.type(), req.target());
+                }
+                if (record instanceof WarcResponse rsp) {
+                    sampleData.put(record.type(), rsp.target());
+                    System.out.println(rsp.target());
+                }
+            });
+        }
+
+        System.out.println(sampleData);
+
+        // Timeout is set to 1 second, but the server will take 5 seconds to respond,
+        // so we expect the request to take 1s and change before it times out.
+
+        Assertions.assertTrue(Duration.between(start, end).toMillis() < 3000);
+    }
+
+}
@@ -2,11 +2,14 @@ package nu.marginalia.crawl.retreival.fetcher;

 import nu.marginalia.UserAgent;
 import nu.marginalia.crawl.fetcher.ContentTags;
-import nu.marginalia.crawl.fetcher.Cookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
+import org.apache.hc.client5.http.classic.HttpClient;
+import org.apache.hc.client5.http.cookie.BasicCookieStore;
+import org.apache.hc.client5.http.impl.classic.HttpClients;
+import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -17,8 +20,6 @@ import org.netpreserve.jwarc.WarcXResponseReference;

 import java.io.IOException;
 import java.net.URISyntaxException;
-import java.net.http.HttpClient;
-import java.net.http.HttpRequest;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.security.NoSuchAlgorithmException;
@@ -35,12 +36,12 @@ class WarcRecorderTest {
     HttpClient httpClient;
     @BeforeEach
     public void setUp() throws Exception {
-        httpClient = HttpClient.newBuilder().build();
+        httpClient = HttpClients.createDefault();

         fileNameWarc = Files.createTempFile("test", ".warc");
         fileNameParquet = Files.createTempFile("test", ".parquet");

-        client = new WarcRecorder(fileNameWarc, new Cookies());
+        client = new WarcRecorder(fileNameWarc, new BasicCookieStore());
     }

     @AfterEach
@@ -52,11 +53,10 @@ class WarcRecorderTest {
     @Test
     void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
         client.fetch(httpClient,
-                HttpRequest.newBuilder()
-                        .uri(new java.net.URI("https://www.marginalia.nu/"))
-                        .header("User-agent", "test.marginalia.nu")
-                        .header("Accept-Encoding", "gzip")
-                        .GET().build()
+                ClassicRequestBuilder.get(new java.net.URI("https://www.marginalia.nu/"))
+                        .addHeader("User-agent", "test.marginalia.nu")
+                        .addHeader("Accept-Encoding", "gzip")
+                        .build()
         );

         Map<String, String> sampleData = new HashMap<>();
@@ -78,7 +78,7 @@ class WarcRecorderTest {
     @Test
     public void flagAsSkipped() throws IOException, URISyntaxException {

-        try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
+        try (var recorder = new WarcRecorder(fileNameWarc, new BasicCookieStore())) {
             recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
                     "text/html",
                     200,
@@ -102,7 +102,7 @@ class WarcRecorderTest {
     @Test
     public void flagAsSkippedNullBody() throws IOException, URISyntaxException {

-        try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
+        try (var recorder = new WarcRecorder(fileNameWarc, new BasicCookieStore())) {
             recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
                     "text/html",
                     200,
@@ -114,7 +114,7 @@ class WarcRecorderTest {

     @Test
     public void testSaveImport() throws URISyntaxException, IOException {
-        try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
+        try (var recorder = new WarcRecorder(fileNameWarc, new BasicCookieStore())) {
             recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
                     "text/html",
                     200,
@@ -138,23 +138,23 @@ class WarcRecorderTest {

     @Test
     public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        client.fetch(httpClient, HttpRequest.newBuilder()
-                .uri(new java.net.URI("https://www.marginalia.nu/"))
-                .header("User-agent", "test.marginalia.nu")
-                .header("Accept-Encoding", "gzip")
-                .GET().build());
+        client.fetch(httpClient, ClassicRequestBuilder
+                .get(new java.net.URI("https://www.marginalia.nu/"))
+                .addHeader("User-agent", "test.marginalia.nu")
+                .addHeader("Accept-Encoding", "gzip")
+                .build());

-        client.fetch(httpClient, HttpRequest.newBuilder()
-                .uri(new java.net.URI("https://www.marginalia.nu/log/"))
-                .header("User-agent", "test.marginalia.nu")
-                .header("Accept-Encoding", "gzip")
-                .GET().build());
+        client.fetch(httpClient, ClassicRequestBuilder
+                .get(new java.net.URI("https://www.marginalia.nu/log/"))
+                .addHeader("User-agent", "test.marginalia.nu")
+                .addHeader("Accept-Encoding", "gzip")
+                .build());

-        client.fetch(httpClient, HttpRequest.newBuilder()
-                .uri(new java.net.URI("https://www.marginalia.nu/sanic.png"))
-                .header("User-agent", "test.marginalia.nu")
-                .header("Accept-Encoding", "gzip")
-                .GET().build());
+        client.fetch(httpClient, ClassicRequestBuilder
+                .get(new java.net.URI("https://www.marginalia.nu/sanic.png"))
+                .addHeader("User-agent", "test.marginalia.nu")
+                .addHeader("Accept-Encoding", "gzip")
+                .build());

         CrawledDocumentParquetRecordFileWriter.convertWarc(
                 "www.marginalia.nu",
@@ -31,7 +31,7 @@ class HttpFetcherTest {
     void fetchUTF8() throws Exception {
         var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
         try (var recorder = new WarcRecorder()) {
-            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
             if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
                 System.out.println(bodyOk.contentType());
             }
@@ -49,7 +49,7 @@ class HttpFetcherTest {
         var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");

         try (var recorder = new WarcRecorder()) {
-            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
             if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
                 System.out.println(bodyOk.contentType());
             }
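Editor's note: both HttpFetcherTest call sites now thread a CrawlDelayTimer through fetchContent. A condensed sketch of the updated call shape, using only names visible in this change set; the import paths for the project-internal classes are taken from the import hunks below, and the literal 100 is simply the value the tests pass:

    import nu.marginalia.crawl.fetcher.ContentTags;
    import nu.marginalia.crawl.fetcher.HttpFetcher;
    import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
    import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
    import nu.marginalia.crawl.retreival.CrawlDelayTimer;
    import nu.marginalia.model.EdgeUrl;

    class FetchContentSketch {
        void fetchOnce() throws Exception {
            var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
            try (var recorder = new WarcRecorder()) {
                // The crawl-delay timer now travels with each fetch call.
                var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"),
                        recorder, new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
            }
        }
    }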
@@ -3,7 +3,10 @@ package nu.marginalia.crawling.retreival;
 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.DomainStateDb;
-import nu.marginalia.crawl.fetcher.*;
+import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.HttpFetcher;
+import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
+import nu.marginalia.crawl.fetcher.SitemapRetriever;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.CrawlDelayTimer;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
@@ -15,6 +18,9 @@ import nu.marginalia.model.crawldata.CrawledDocument;
 import nu.marginalia.model.crawldata.CrawlerDocumentStatus;
 import nu.marginalia.model.crawldata.SerializableCrawlData;
 import nu.marginalia.test.CommonTestData;
+import org.apache.hc.client5.http.cookie.BasicCookieStore;
+import org.apache.hc.client5.http.cookie.CookieStore;
+import org.apache.hc.core5.http.Header;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -24,7 +30,6 @@ import org.slf4j.LoggerFactory;

 import java.io.IOException;
 import java.net.URISyntaxException;
-import java.net.http.HttpHeaders;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
@@ -120,7 +125,7 @@ public class CrawlerMockFetcherTest {
         public void setAllowAllContentTypes(boolean allowAllContentTypes) {}

         @Override
-        public Cookies getCookies() { return new Cookies();}
+        public CookieStore getCookies() { return new BasicCookieStore();}

         @Override
         public void clearCookies() {}
@@ -132,13 +137,7 @@ public class CrawlerMockFetcherTest {
         }

         @Override
-        public ContentTypeProbeResult probeContentType(EdgeUrl url, WarcRecorder recorder, ContentTags tags) {
-            logger.info("Probing {}", url);
-            return new HttpFetcher.ContentTypeProbeResult.Ok(url);
-        }
-
-        @Override
-        public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags, ProbeType probeType) {
+        public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, CrawlDelayTimer timer, ContentTags tags, ProbeType probeType) {
             logger.info("Fetching {}", url);
             if (mockData.containsKey(url)) {
                 byte[] bodyBytes = mockData.get(url).documentBodyBytes;
@@ -147,7 +146,7 @@ public class CrawlerMockFetcherTest {
                 return new HttpFetchResult.ResultOk(
                         url.asURI(),
                         200,
-                        HttpHeaders.of(Map.of(), (k,v)->true),
+                        new Header[0],
                         "127.0.0.1",
                         bodyBytes,
                         0,
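Editor's note: with the java.net.http dependency gone from the mock fetcher, headers and cookies are expressed with Apache HttpClient 5 types instead: HttpHeaders.of(...) becomes a Header[], and the project's Cookies wrapper becomes a CookieStore backed by BasicCookieStore. A small sketch of those stock Apache types; BasicHeader and BasicClientCookie are standard httpcore5/httpclient5 classes and are not part of this change set:

    import org.apache.hc.client5.http.cookie.BasicCookieStore;
    import org.apache.hc.client5.http.cookie.CookieStore;
    import org.apache.hc.client5.http.impl.cookie.BasicClientCookie;
    import org.apache.hc.core5.http.Header;
    import org.apache.hc.core5.http.message.BasicHeader;

    class ApacheHttpTypesSketch {
        // An explicit Header array; the mock above simply passes new Header[0].
        static Header[] headers() {
            return new Header[] { new BasicHeader("Content-Type", "text/html") };
        }

        // BasicCookieStore is the stock in-memory CookieStore implementation.
        static CookieStore cookies() {
            var store = new BasicCookieStore();
            store.addCookie(new BasicClientCookie("session", "dummy-value"));
            return store;
        }
    }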
@@ -5,7 +5,6 @@ import nu.marginalia.WmsaHome;
 import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.DomainStateDb;
-import nu.marginalia.crawl.fetcher.Cookies;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@@ -16,7 +15,8 @@ import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawldata.CrawledDocument;
 import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.model.crawldata.SerializableCrawlData;
-import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
+import nu.marginalia.slop.SlopCrawlDataRecord;
+import org.apache.hc.client5.http.cookie.BasicCookieStore;
 import org.jetbrains.annotations.NotNull;
 import org.junit.jupiter.api.*;
 import org.netpreserve.jwarc.*;
@@ -37,16 +37,16 @@ class CrawlerRetreiverTest {
     private HttpFetcher httpFetcher;

     Path tempFileWarc1;
-    Path tempFileParquet1;
+    Path tempFileSlop1;
     Path tempFileWarc2;
-    Path tempFileParquet2;
+    Path tempFileSlop2;
     Path tempFileWarc3;
     Path tempFileDb;
     @BeforeEach
     public void setUp() throws IOException {
         httpFetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
-        tempFileParquet1 = Files.createTempFile("crawling-process", ".parquet");
-        tempFileParquet2 = Files.createTempFile("crawling-process", ".parquet");
+        tempFileSlop1 = Files.createTempFile("crawling-process", ".slop.zip");
+        tempFileSlop2 = Files.createTempFile("crawling-process", ".slop.zip");
         tempFileDb = Files.createTempFile("crawling-process", ".db");

     }
@@ -62,14 +62,14 @@ class CrawlerRetreiverTest {
         if (tempFileWarc1 != null) {
             Files.deleteIfExists(tempFileWarc1);
         }
-        if (tempFileParquet1 != null) {
-            Files.deleteIfExists(tempFileParquet1);
+        if (tempFileSlop1 != null) {
+            Files.deleteIfExists(tempFileSlop1);
         }
         if (tempFileWarc2 != null) {
             Files.deleteIfExists(tempFileWarc2);
         }
-        if (tempFileParquet2 != null) {
-            Files.deleteIfExists(tempFileParquet2);
+        if (tempFileSlop2 != null) {
+            Files.deleteIfExists(tempFileSlop2);
         }
         if (tempFileWarc3 != null) {
             Files.deleteIfExists(tempFileWarc3);
@@ -180,7 +180,7 @@ class CrawlerRetreiverTest {
                 new EdgeDomain("www.marginalia.nu"),
                 List.of(), 100);
         var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
-                new WarcRecorder(tempFileWarc2, new Cookies())
+                new WarcRecorder(tempFileWarc2, new BasicCookieStore())
         );

         // truncate the size of the file to simulate a crash
@@ -224,9 +224,9 @@ class CrawlerRetreiverTest {

         doCrawl(tempFileWarc1, specs);

-        convertToParquet(tempFileWarc1, tempFileParquet1);
+        convertToSlop(tempFileWarc1, tempFileSlop1);

-        try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
+        try (var stream = SerializableCrawlDataStream.openDataStream(tempFileSlop1)) {
             while (stream.hasNext()) {
                 if (stream.next() instanceof CrawledDocument doc) {
                     data.add(doc);
@@ -277,9 +277,9 @@ class CrawlerRetreiverTest {
         assertFalse(frontier.isVisited(new EdgeUrl("https://www.marginalia.nu/log/06-optimization/")));
         assertTrue(frontier.isKnown(new EdgeUrl("https://www.marginalia.nu/log/06-optimization/")));

-        convertToParquet(tempFileWarc1, tempFileParquet1);
+        convertToSlop(tempFileWarc1, tempFileSlop1);

-        try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
+        try (var stream = SerializableCrawlDataStream.openDataStream(tempFileSlop1)) {
             while (stream.hasNext()) {
                 if (stream.next() instanceof CrawledDocument doc) {
                     data.add(doc);
@@ -293,7 +293,7 @@ class CrawlerRetreiverTest {
         // redirects to https://www.marginalia.nu/log/06-optimization.gmi/ (note the trailing slash)
         //
         // Ensure that the redirect is followed, and that the trailing slash is added
-        // to the url as reported in the parquet file.
+        // to the url as reported in the Slop file.

         var fetchedUrls =
                 data.stream()
@@ -326,9 +326,9 @@ class CrawlerRetreiverTest {
         tempFileWarc1 = Files.createTempFile("crawling-process", ".warc");

         doCrawl(tempFileWarc1, specs);
-        convertToParquet(tempFileWarc1, tempFileParquet1);
+        convertToSlop(tempFileWarc1, tempFileSlop1);

-        try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
+        try (var stream = SerializableCrawlDataStream.openDataStream(tempFileSlop1)) {
             while (stream.hasNext()) {
                 if (stream.next() instanceof CrawledDocument doc) {
                     data.add(doc);
@@ -373,11 +373,11 @@ class CrawlerRetreiverTest {
         tempFileWarc2 = Files.createTempFile("crawling-process", ".warc.gz");

         doCrawl(tempFileWarc1, specs);
-        convertToParquet(tempFileWarc1, tempFileParquet1);
+        convertToSlop(tempFileWarc1, tempFileSlop1);
         doCrawlWithReferenceStream(specs,
-                new CrawlDataReference(tempFileParquet1)
+                new CrawlDataReference(tempFileSlop1)
         );
-        convertToParquet(tempFileWarc2, tempFileParquet2);
+        convertToSlop(tempFileWarc2, tempFileSlop2);

         try (var reader = new WarcReader(tempFileWarc2)) {
             WarcXResponseReference.register(reader);
@@ -396,7 +396,7 @@ class CrawlerRetreiverTest {
             });
         }

-        try (var ds = SerializableCrawlDataStream.openDataStream(tempFileParquet2)) {
+        try (var ds = SerializableCrawlDataStream.openDataStream(tempFileSlop2)) {
             while (ds.hasNext()) {
                 var doc = ds.next();
                 if (doc instanceof CrawledDomain dr) {
@@ -411,9 +411,9 @@ class CrawlerRetreiverTest {
         }
     }

-    private void convertToParquet(Path tempFileWarc2, Path tempFileParquet2) {
-        CrawledDocumentParquetRecordFileWriter.convertWarc("www.marginalia.nu",
-                new UserAgent("test", "test"), tempFileWarc2, tempFileParquet2);
+    private void convertToSlop(Path tempFileWarc2, Path tempFileSlop2) throws IOException {
+        SlopCrawlDataRecord.convertWarc("www.marginalia.nu",
+                new UserAgent("test", "test"), tempFileWarc2, tempFileSlop2);
     }


@@ -436,9 +436,9 @@ class CrawlerRetreiverTest {

         doCrawl(tempFileWarc1, specs);

-        convertToParquet(tempFileWarc1, tempFileParquet1);
+        convertToSlop(tempFileWarc1, tempFileSlop1);

-        try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
+        try (var stream = SerializableCrawlDataStream.openDataStream(tempFileSlop1)) {
             while (stream.hasNext()) {
                 var doc = stream.next();
                 data.computeIfAbsent(doc.getClass(), c -> new ArrayList<>()).add(doc);
@@ -449,14 +449,14 @@ class CrawlerRetreiverTest {

         System.out.println("---");

-        doCrawlWithReferenceStream(specs, new CrawlDataReference(tempFileParquet1));
+        doCrawlWithReferenceStream(specs, new CrawlDataReference(tempFileSlop1));

         var revisitCrawlFrontier = new DomainCrawlFrontier(
                 new EdgeDomain("www.marginalia.nu"),
                 List.of(), 100);

         var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
-                new WarcRecorder(tempFileWarc3, new Cookies())
+                new WarcRecorder(tempFileWarc3, new BasicCookieStore())
         );

         // truncate the size of the file to simulate a crash
@@ -465,7 +465,7 @@ class CrawlerRetreiverTest {
         resync.run(tempFileWarc2);

         assertTrue(revisitCrawlFrontier.addKnown(new EdgeUrl("https://www.marginalia.nu/")));
-        convertToParquet(tempFileWarc3, tempFileParquet2);
+        convertToSlop(tempFileWarc3, tempFileSlop2);


         try (var reader = new WarcReader(tempFileWarc3)) {
@@ -485,7 +485,7 @@ class CrawlerRetreiverTest {
             });
         }

-        try (var ds = SerializableCrawlDataStream.openDataStream(tempFileParquet2)) {
+        try (var ds = SerializableCrawlDataStream.openDataStream(tempFileSlop2)) {
             while (ds.hasNext()) {
                 var doc = ds.next();
                 if (doc instanceof CrawledDomain dr) {
@@ -507,7 +507,7 @@ class CrawlerRetreiverTest {
     }

     private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, CrawlDataReference reference) {
-        try (var recorder = new WarcRecorder(tempFileWarc2, new Cookies());
+        try (var recorder = new WarcRecorder(tempFileWarc2, new BasicCookieStore());
             var db = new DomainStateDb(tempFileDb)
         ) {
             new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(), reference);
@@ -519,7 +519,7 @@ class CrawlerRetreiverTest {

     @NotNull
     private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlerMain.CrawlSpecRecord specs) {
-        try (var recorder = new WarcRecorder(tempFileWarc1, new Cookies());
+        try (var recorder = new WarcRecorder(tempFileWarc1, new BasicCookieStore());
             var db = new DomainStateDb(tempFileDb)
         ) {
             var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder);
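Editor's note: taken together, the CrawlerRetreiverTest changes above swap the Parquet round trip for a Slop one. A condensed sketch of that round trip using only the calls visible in the diff; the import paths for UserAgent and SerializableCrawlDataStream are assumptions, since they are not shown in this change set, and the temp-file name and test user agent mirror the tests:

    import nu.marginalia.UserAgent;                       // assumed package
    import nu.marginalia.io.SerializableCrawlDataStream;  // assumed package
    import nu.marginalia.model.crawldata.CrawledDocument;
    import nu.marginalia.slop.SlopCrawlDataRecord;

    import java.nio.file.Files;
    import java.nio.file.Path;

    class WarcToSlopSketch {
        static void roundTrip(Path warcFile) throws Exception {
            Path slopFile = Files.createTempFile("crawling-process", ".slop.zip");

            // WARC -> Slop conversion, taking the place of
            // CrawledDocumentParquetRecordFileWriter.convertWarc in the old tests.
            SlopCrawlDataRecord.convertWarc("www.marginalia.nu",
                    new UserAgent("test", "test"), warcFile, slopFile);

            // Read the converted records back the same way the tests do.
            try (var stream = SerializableCrawlDataStream.openDataStream(slopFile)) {
                while (stream.hasNext()) {
                    if (stream.next() instanceof CrawledDocument doc) {
                        System.out.println(doc);
                    }
                }
            }
        }
    }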
@@ -44,6 +44,7 @@ dependencies {
     implementation libs.guice
     implementation libs.fastutil
     implementation libs.trove
+    implementation libs.bundles.httpcomponents

     testImplementation libs.bundles.junit
     testImplementation project(':code:libraries:test-helpers')
@@ -10,7 +10,6 @@ import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.converting.writer.ConverterBatchWriter;
 import nu.marginalia.crawl.fetcher.ContentTags;
-import nu.marginalia.crawl.fetcher.Cookies;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.functions.searchquery.QueryFactory;
@@ -44,6 +43,7 @@ import nu.marginalia.process.control.FakeProcessHeartbeat;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.test.IntegrationTestModule;
 import nu.marginalia.test.TestUtil;
+import org.apache.hc.client5.http.cookie.BasicCookieStore;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -121,7 +121,7 @@ public class IntegrationTest {
     public void run() throws Exception {

         /** CREATE WARC */
-        try (WarcRecorder warcRecorder = new WarcRecorder(warcData, new Cookies())) {
+        try (WarcRecorder warcRecorder = new WarcRecorder(warcData, new BasicCookieStore())) {
             warcRecorder.writeWarcinfoHeader("127.0.0.1", new EdgeDomain("www.example.com"),
                     new HttpFetcherImpl.DomainProbeResult.Ok(new EdgeUrl("https://www.example.com/")));

@@ -179,8 +179,9 @@ dependencyResolutionManagement {

             library('jwarc', 'org.netpreserve', 'jwarc').version('0.28.5')

-            library('httpcomponents.core','org.apache.httpcomponents','httpcore').version('4.4.15')
-            library('httpcomponents.client','org.apache.httpcomponents','httpclient').version('4.5.13')
+            library('httpcore', 'org.apache.httpcomponents.core5','httpcore5').version('5.3.4')
+            library('httpclient', 'org.apache.httpcomponents.client5','httpclient5').version('5.4.3')
             library('commons.net', 'commons-net','commons-net').version('3.9.0')
             library('commons.lang3', 'org.apache.commons','commons-lang3').version('3.12.0')
             library('commons.compress','org.apache.commons','commons-compress').version('1.25.0')
@@ -255,7 +256,7 @@ dependencyResolutionManagement {
         bundle('grpc', ['protobuf', 'grpc-stub', 'grpc-protobuf', 'grpc-netty'])
         bundle('protobuf', ['protobuf', 'javax.annotation'])
         bundle('gson', ['gson', 'gson-type-adapter'])
-        bundle('httpcomponents', ['httpcomponents.core', 'httpcomponents.client'])
+        bundle('httpcomponents', ['httpcore', 'httpclient'])
         bundle('parquet', ['parquet-column', 'parquet-hadoop'])
         bundle('junit', ['junit.jupiter', 'junit.jupiter.engine'])
         bundle('flyway', ['flyway.core', 'flyway.mysql'])