mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits


8 Commits

Author SHA1 Message Date
Viktor Lofgren
3c43f1954e (crawler) Add custom cookie store implementation
Apache HttpClient's cookie implementation builds an enormous concurrent hashmap holding every cookie for every domain ever crawled. This is a big waste of resources.

Replacing it with a fairly crude domain-isolated instance, as we are primarily interested in answering whether a cookie is set, and we will never retain cookies long term.
2025-04-18 13:04:22 +02:00
Viktor Lofgren
fa2462ec39 (crawler) Re-enable aborts on timeout 2025-04-18 12:59:34 +02:00
Viktor Lofgren
f4ad7145db (crawler) Disable SO_LINGER 2025-04-18 01:42:02 +02:00
Viktor Lofgren
068b450180 (crawler) Temporarily disable request.abort() 2025-04-18 01:25:56 +02:00
Viktor Lofgren
05b909a21f (crawler) Add logging to get more info about connection leak 2025-04-18 01:06:52 +02:00
Viktor Lofgren
3d179cddce (crawler) Correctly consume entity in sitemap retrieval 2025-04-18 00:32:21 +02:00
Viktor Lofgren
1a2aae496a (crawler) Correct handling and aborting of HttpClient's requests
There was a resource leak in the initial Apache HttpClient implementation of the WarcInputBuffer, which failed to release its underlying resources.

Using HttpGet objects instead of the Classic...Request objects, as the latter do not expose an abort() method.
2025-04-18 00:16:26 +02:00
Viktor Lofgren
353cdffb3f (crawler) Increase connection request timeout, restore congestion timeout 2025-04-17 21:32:06 +02:00
22 changed files with 381 additions and 255 deletions

View File

@@ -20,7 +20,6 @@ import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.crawldata.CrawledDomain;
import nu.marginalia.model.crawldata.SerializableCrawlData;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
import org.apache.hc.client5.http.cookie.BasicCookieStore;
import org.junit.jupiter.api.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -247,7 +246,7 @@ public class CrawlingThenConvertingIntegrationTest {
private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws Exception {
List<SerializableCrawlData> data = new ArrayList<>();
try (var recorder = new WarcRecorder(fileName, new BasicCookieStore());
try (var recorder = new WarcRecorder(fileName);
var db = new DomainStateDb(dbTempFile))
{
new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, db, recorder).crawlDomain();

View File

@@ -438,7 +438,7 @@ public class CrawlerMain extends ProcessMainClass {
Files.deleteIfExists(tempFile);
}
try (var warcRecorder = new WarcRecorder(newWarcFile, fetcher); // write to a temp file for now
try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
CrawlDataReference reference = getReference()
)

View File

@@ -1,6 +1,6 @@
package nu.marginalia.crawl.fetcher;
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
import org.apache.hc.client5.http.classic.methods.HttpGet;
/** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
public record ContentTags(String etag, String lastMod) {
@@ -17,14 +17,14 @@ public record ContentTags(String etag, String lastMod) {
}
/** Paints the tags onto the request builder. */
public void paint(ClassicRequestBuilder getBuilder) {
public void paint(HttpGet request) {
if (etag != null) {
getBuilder.addHeader("If-None-Match", etag);
request.addHeader("If-None-Match", etag);
}
if (lastMod != null) {
getBuilder.addHeader("If-Modified-Since", lastMod);
request.addHeader("If-Modified-Since", lastMod);
}
}
}
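
The paint() signature change above means the conditional-request headers are now stamped directly onto an HttpGet. A minimal usage sketch, assuming the Apache HttpClient 5 classic API; the URL and tag values are made up for illustration:

import nu.marginalia.crawl.fetcher.ContentTags;
import org.apache.hc.client5.http.classic.methods.HttpGet;

HttpGet request = new HttpGet("https://example.com/page.html");
// Replay the validators captured on the previous crawl of this document
new ContentTags("\"etag-from-last-crawl\"", "Wed, 21 Oct 2024 07:28:00 GMT").paint(request);
// A server that honors these may answer 304 Not Modified, letting the
// crawler reuse the previously stored body instead of re-fetching it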

View File

@@ -1,34 +0,0 @@
package nu.marginalia.crawl.fetcher;
import java.io.IOException;
import java.net.CookieHandler;
import java.net.URI;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
public class Cookies extends CookieHandler {
final ThreadLocal<ConcurrentHashMap<String, List<String>>> cookieJar = ThreadLocal.withInitial(ConcurrentHashMap::new);
public void clear() {
cookieJar.get().clear();
}
public boolean hasCookies() {
return !cookieJar.get().isEmpty();
}
public List<String> getCookies() {
return cookieJar.get().values().stream().flatMap(List::stream).toList();
}
@Override
public Map<String, List<String>> get(URI uri, Map<String, List<String>> requestHeaders) throws IOException {
return cookieJar.get();
}
@Override
public void put(URI uri, Map<String, List<String>> responseHeaders) throws IOException {
cookieJar.get().putAll(responseHeaders);
}
}

View File

@@ -0,0 +1,56 @@
package nu.marginalia.crawl.fetcher;
import org.apache.hc.client5.http.classic.methods.HttpUriRequestBase;
import org.apache.hc.core5.http.ClassicHttpRequest;
import org.apache.hc.core5.http.HttpResponse;
import java.util.HashMap;
import java.util.Map;
import java.util.StringJoiner;
public class DomainCookies {
private final Map<String, String> cookies = new HashMap<>();
public boolean hasCookies() {
return !cookies.isEmpty();
}
public void updateCookieStore(HttpResponse response) {
for (var header : response.getHeaders()) {
if (header.getName().equalsIgnoreCase("Set-Cookie")) {
parseCookieHeader(header.getValue());
}
}
}
private void parseCookieHeader(String value) {
// Parse only the name=value pair of the Set-Cookie header; attributes
// such as Path or Expires are deliberately ignored by this crude store
String[] parts = value.split(";");
String cookie = parts[0].trim();
if (cookie.contains("=")) {
// Split on the first '=' only, so values containing '=' are kept intact
// and a trailing "name=" does not throw
String[] cookieParts = cookie.split("=", 2);
String name = cookieParts[0].trim();
String val = cookieParts[1].trim();
cookies.put(name, val);
}
}
public void paintRequest(HttpUriRequestBase request) {
if (hasCookies()) {
request.addHeader("Cookie", createCookieHeader());
}
}
public void paintRequest(ClassicHttpRequest request) {
if (hasCookies()) {
request.addHeader("Cookie", createCookieHeader());
}
}
private String createCookieHeader() {
StringJoiner sj = new StringJoiner("; ");
for (var cookie : cookies.entrySet()) {
sj.add(cookie.getKey() + "=" + cookie.getValue());
}
return sj.toString();
}
}
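
A rough sketch of the intended lifecycle of the class, assuming client is an Apache HttpClient 5 classic client and example.com stands in for the crawled domain; this is an illustration, not the crawler's literal control flow:

DomainCookies cookies = new DomainCookies();   // one instance per crawled domain
HttpGet first = new HttpGet("https://example.com/");
cookies.paintRequest(first);                   // adds nothing while the jar is empty
client.execute(first, rsp -> {
    cookies.updateCookieStore(rsp);            // capture any Set-Cookie headers
    return null;
});
HttpGet next = new HttpGet("https://example.com/page");
cookies.paintRequest(next);                    // replays the captured name=value pairs

Unlike the BasicCookieStore it replaces, nothing here is shared across domains, and the store is discarded with the crawl task, which is what the commit message above is after.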

View File

@@ -23,6 +23,7 @@ public interface HttpFetcher extends AutoCloseable {
HttpFetchResult fetchContent(EdgeUrl url,
WarcRecorder recorder,
DomainCookies cookies,
CrawlDelayTimer timer,
ContentTags tags,
ProbeType probeType);

View File

@@ -17,6 +17,7 @@ import nu.marginalia.model.crawldata.CrawlerDomainStatus;
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
import org.apache.hc.client5.http.HttpRequestRetryStrategy;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.client5.http.config.ConnectionConfig;
import org.apache.hc.client5.http.config.RequestConfig;
import org.apache.hc.client5.http.cookie.BasicCookieStore;
@@ -34,6 +35,7 @@ import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
import org.apache.hc.core5.http.message.MessageSupport;
import org.apache.hc.core5.http.protocol.HttpContext;
import org.apache.hc.core5.pool.PoolStats;
import org.apache.hc.core5.util.TimeValue;
import org.apache.hc.core5.util.Timeout;
import org.jsoup.Jsoup;
@@ -76,14 +78,20 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
}
private final CloseableHttpClient client;
private PoolingHttpClientConnectionManager connectionManager;
public PoolStats getPoolStats() {
return connectionManager.getTotalStats();
}
private CloseableHttpClient createClient() throws NoSuchAlgorithmException {
final ConnectionConfig connectionConfig = ConnectionConfig.custom()
.setSocketTimeout(10, TimeUnit.SECONDS)
.setConnectTimeout(30, TimeUnit.SECONDS)
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
.build();
final PoolingHttpClientConnectionManager connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
.setMaxConnPerRoute(2)
.setMaxConnTotal(5000)
.setDefaultConnectionConfig(connectionConfig)
@@ -91,15 +99,27 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
.build();
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
.setSoLinger(TimeValue.ofSeconds(15))
.setSoLinger(TimeValue.ofSeconds(-1))
.setSoTimeout(Timeout.ofSeconds(10))
.build()
);
Thread.ofPlatform().daemon(true).start(() -> {
try {
for (;;) {
TimeUnit.SECONDS.sleep(15);
logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
}
}
catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
});
final RequestConfig defaultRequestConfig = RequestConfig.custom()
.setCookieSpec(StandardCookieSpec.RELAXED)
.setResponseTimeout(10, TimeUnit.SECONDS)
.setConnectionRequestTimeout(8, TimeUnit.SECONDS)
.setConnectionRequestTimeout(5, TimeUnit.MINUTES)
.build();
return HttpClients.custom()
@@ -287,6 +307,7 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
* recorded in the WARC file on failure.
*/
public ContentTypeProbeResult probeContentType(EdgeUrl url,
DomainCookies cookies,
CrawlDelayTimer timer,
ContentTags tags) {
if (!tags.isEmpty() || !contentTypeLogic.isUrlLikeBinary(url)) {
@@ -299,9 +320,11 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
.addHeader("Accept-Encoding", "gzip")
.build();
var result = SendLock.wrapSend(client, head, (rsp) -> {
EntityUtils.consume(rsp.getEntity());
cookies.paintRequest(head);
return SendLock.wrapSend(client, head, (rsp) -> {
cookies.updateCookieStore(rsp);
EntityUtils.consume(rsp.getEntity());
int statusCode = rsp.getCode();
// Handle redirects
@@ -339,8 +362,6 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
return new ContentTypeProbeResult.BadContentType(contentType, statusCode);
}
});
return result;
}
catch (SocketTimeoutException ex) {
@@ -362,6 +383,7 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
@Override
public HttpFetchResult fetchContent(EdgeUrl url,
WarcRecorder warcRecorder,
DomainCookies cookies,
CrawlDelayTimer timer,
ContentTags contentTags,
ProbeType probeType)
@@ -369,7 +391,7 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
try {
if (probeType == HttpFetcher.ProbeType.FULL) {
try {
var probeResult = probeContentType(url, timer, contentTags);
var probeResult = probeContentType(url, cookies, timer, contentTags);
logger.info(crawlerAuditMarker, "Probe result {} for {}", probeResult.getClass().getSimpleName(), url);
switch (probeResult) {
case HttpFetcher.ContentTypeProbeResult.NoOp():
@@ -398,16 +420,16 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
}
ClassicRequestBuilder getBuilder = ClassicRequestBuilder.get(url.asURI())
.addHeader("User-Agent", userAgentString)
.addHeader("Accept-Encoding", "gzip")
.addHeader("Accept-Language", "en,*;q=0.5")
.addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8");
HttpGet request = new HttpGet(url.asURI());
request.addHeader("User-Agent", userAgentString);
request.addHeader("Accept-Encoding", "gzip");
request.addHeader("Accept-Language", "en,*;q=0.5");
request.addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8");
contentTags.paint(getBuilder);
contentTags.paint(request);
try (var sl = new SendLock()) {
HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());
HttpFetchResult result = warcRecorder.fetch(client, cookies, request);
if (result instanceof HttpFetchResult.ResultOk ok) {
if (ok.statusCode() == 304) {
@@ -494,56 +516,61 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
}
private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
ClassicHttpRequest getRequest = ClassicRequestBuilder.get(sitemapUrl.asURI())
.addHeader("User-Agent", userAgentString)
.addHeader("Accept-Encoding", "gzip")
.addHeader("Accept", "text/*, */*;q=0.9")
.addHeader("User-Agent", userAgentString)
.build();
private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException {
HttpGet getRequest = new HttpGet(sitemapUrl.asURI());
getRequest.addHeader("User-Agent", userAgentString);
getRequest.addHeader("Accept-Encoding", "gzip");
getRequest.addHeader("Accept", "text/*, */*;q=0.9");
getRequest.addHeader("User-Agent", userAgentString);
try (var sl = new SendLock()) {
return client.execute(getRequest, response -> {
if (response.getCode() != 200) {
return new SitemapResult.SitemapError();
try {
if (response.getCode() != 200) {
return new SitemapResult.SitemapError();
}
Document parsedSitemap = Jsoup.parse(
EntityUtils.toString(response.getEntity()),
sitemapUrl.toString(),
Parser.xmlParser()
);
if (parsedSitemap.childrenSize() == 0) {
return new SitemapResult.SitemapError();
}
String rootTagName = parsedSitemap.child(0).tagName();
return switch (rootTagName.toLowerCase()) {
case "sitemapindex" -> {
List<String> references = new ArrayList<>();
for (var locTag : parsedSitemap.getElementsByTag("loc")) {
references.add(locTag.text().trim());
}
yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
}
case "urlset" -> {
List<String> urls = new ArrayList<>();
for (var locTag : parsedSitemap.select("url > loc")) {
urls.add(locTag.text().trim());
}
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
}
case "rss", "atom" -> {
List<String> urls = new ArrayList<>();
for (var locTag : parsedSitemap.select("link, url")) {
urls.add(locTag.text().trim());
}
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
}
default -> new SitemapResult.SitemapError();
};
}
Document parsedSitemap = Jsoup.parse(
EntityUtils.toString(response.getEntity()),
sitemapUrl.toString(),
Parser.xmlParser()
);
if (parsedSitemap.childrenSize() == 0) {
return new SitemapResult.SitemapError();
finally {
EntityUtils.consume(response.getEntity());
}
String rootTagName = parsedSitemap.child(0).tagName();
return switch (rootTagName.toLowerCase()) {
case "sitemapindex" -> {
List<String> references = new ArrayList<>();
for (var locTag : parsedSitemap.getElementsByTag("loc")) {
references.add(locTag.text().trim());
}
yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
}
case "urlset" -> {
List<String> urls = new ArrayList<>();
for (var locTag : parsedSitemap.select("url > loc")) {
urls.add(locTag.text().trim());
}
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
}
case "rss", "atom" -> {
List<String> urls = new ArrayList<>();
for (var locTag : parsedSitemap.select("link, url")) {
urls.add(locTag.text().trim());
}
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
}
default -> new SitemapResult.SitemapError();
};
});
}
catch (Exception ex) {
@@ -574,13 +601,12 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
try (var sl = new SendLock()) {
ClassicHttpRequest request = ClassicRequestBuilder.get(url.asURI())
.addHeader("User-Agent", userAgentString)
.addHeader("Accept-Encoding", "gzip")
.addHeader("Accept", "text/*, */*;q=0.9")
.build();
HttpGet request = new HttpGet(url.asURI());
request.addHeader("User-Agent", userAgentString);
request.addHeader("Accept-Encoding", "gzip");
request.addHeader("Accept", "text/*, */*;q=0.9");
HttpFetchResult result = recorder.fetch(client, request);
HttpFetchResult result = recorder.fetch(client, new DomainCookies(), request);
return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) ->
robotsParser.parseContent(url.toString(),
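
The sitemap change above ("Correctly consume entity in sitemap retrieval") follows the general rule for pooled Apache HttpClient connections: the response entity must be fully consumed before the response handler returns, or the underlying connection cannot be returned to the pool and is eventually leaked. A condensed sketch of the pattern, with request construction omitted and client/getRequest assumed from the surrounding code:

String body = client.execute(getRequest, response -> {
    try {
        if (response.getCode() != 200) {
            return null;   // early exit still releases the connection via finally
        }
        return EntityUtils.toString(response.getEntity());
    }
    finally {
        EntityUtils.consume(response.getEntity());   // drain whatever remains
    }
});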

View File

@@ -2,6 +2,7 @@ package nu.marginalia.crawl.fetcher.warc;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BOMInputStream;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.core5.http.ClassicHttpResponse;
import org.apache.hc.core5.http.Header;
import org.netpreserve.jwarc.WarcTruncationReason;
@@ -43,7 +44,9 @@ public abstract class WarcInputBuffer implements AutoCloseable {
* and suppressed from the headers.
* If an error occurs, a buffer will be created with no content and an error status.
*/
static WarcInputBuffer forResponse(ClassicHttpResponse response, Duration timeLimit) throws IOException {
static WarcInputBuffer forResponse(ClassicHttpResponse response,
HttpGet request,
Duration timeLimit) throws IOException {
if (response == null)
return new ErrorBuffer();
@@ -54,16 +57,29 @@ public abstract class WarcInputBuffer implements AutoCloseable {
return new ErrorBuffer();
}
InputStream is = entity.getContent();
long length = entity.getContentLength();
InputStream is = null;
try {
is = entity.getContent();
long length = entity.getContentLength();
try (response) {
if (length > 0 && length < 8192) {
// If the content is small and not compressed, we can just read it into memory
return new MemoryBuffer(response.getHeaders(), timeLimit, is, (int) length);
return new MemoryBuffer(response.getHeaders(), request, timeLimit, is, (int) length);
} else {
// Otherwise, we unpack it into a file and read it from there
return new FileBuffer(response.getHeaders(), timeLimit, is);
return new FileBuffer(response.getHeaders(), request, timeLimit, is);
}
}
finally {
try {
is.skip(Long.MAX_VALUE);
}
catch (IOException e) {
// Ignore the exception
}
finally {
// Close the input stream
IOUtils.closeQuietly(is);
}
}
@@ -71,7 +87,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
}
/** Copy an input stream to an output stream, with a maximum size and time limit */
protected void copy(InputStream is, OutputStream os, Duration timeLimit) {
protected void copy(InputStream is, HttpGet request, OutputStream os, Duration timeLimit) {
Instant start = Instant.now();
Instant timeout = start.plus(timeLimit);
long size = 0;
@@ -86,6 +102,11 @@ public abstract class WarcInputBuffer implements AutoCloseable {
Duration remaining = Duration.between(Instant.now(), timeout);
if (remaining.isNegative()) {
truncationReason = WarcTruncationReason.TIME;
// Abort the request if the time limit is exceeded, so that we neither
// keep the connection open indefinitely nor are forced to drain the
// stream to its end
request.abort();
break;
}
@@ -104,6 +125,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
}
else if (truncationReason != WarcTruncationReason.LENGTH) {
truncationReason = WarcTruncationReason.LENGTH;
break;
}
} catch (IOException e) {
@@ -111,13 +133,6 @@ public abstract class WarcInputBuffer implements AutoCloseable {
}
}
// Try to close the connection as long as we haven't timed out.
// As per Apache HttpClient's semantics, this will reset the connection
// and close the stream if we have timed out.
if (truncationReason != WarcTruncationReason.TIME) {
IOUtils.closeQuietly(is);
}
}
/** Takes a Content-Range header and checks if it is complete.
@@ -218,7 +233,7 @@ class ErrorBuffer extends WarcInputBuffer {
/** Buffer for when we have the response in memory */
class MemoryBuffer extends WarcInputBuffer {
byte[] data;
public MemoryBuffer(Header[] headers, Duration timeLimit, InputStream responseStream, int size) {
public MemoryBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream, int size) {
super(suppressContentEncoding(headers));
if (!isRangeComplete(headers)) {
@@ -229,7 +244,7 @@ class MemoryBuffer extends WarcInputBuffer {
var outputStream = new ByteArrayOutputStream(size);
copy(responseStream, outputStream, timeLimit);
copy(responseStream, request, outputStream, timeLimit);
data = outputStream.toByteArray();
}
@@ -253,7 +268,7 @@ class MemoryBuffer extends WarcInputBuffer {
class FileBuffer extends WarcInputBuffer {
private final Path tempFile;
public FileBuffer(Header[] headers, Duration timeLimit, InputStream responseStream) throws IOException {
public FileBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream) throws IOException {
super(suppressContentEncoding(headers));
if (!isRangeComplete(headers)) {
@@ -265,7 +280,7 @@ class FileBuffer extends WarcInputBuffer {
this.tempFile = Files.createTempFile("rsp", ".html");
try (var out = Files.newOutputStream(tempFile)) {
copy(responseStream, out, timeLimit);
copy(responseStream, request, out, timeLimit);
}
catch (Exception ex) {
truncationReason = WarcTruncationReason.UNSPECIFIED;
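
Taken together, the changes to forResponse() and copy() give the response stream a clear ownership story; in rough outline (copyWithDeadline is a hypothetical stand-in for the copy() method above):

try (response) {                            // always close the classic response
    InputStream is = entity.getContent();
    try {
        copyWithDeadline(is, request);      // calls request.abort() on timeout
    }
    finally {
        try {
            is.skip(Long.MAX_VALUE);        // drain the remainder if still readable
        }
        catch (IOException e) { /* best effort */ }
        IOUtils.closeQuietly(is);
    }
}

Aborting the request on timeout is what makes the final drain safe: without it, a stalled server could force the crawler to either hold the connection open or read an unbounded stream to its end.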

View File

@@ -1,6 +1,7 @@
package nu.marginalia.crawl.fetcher.warc;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.DomainCookies;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.link_parser.LinkParser;
@@ -8,9 +9,7 @@ import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.cookie.BasicCookieStore;
import org.apache.hc.client5.http.cookie.CookieStore;
import org.apache.hc.core5.http.ClassicHttpRequest;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.core5.http.NameValuePair;
import org.jetbrains.annotations.Nullable;
import org.netpreserve.jwarc.*;
@@ -53,23 +52,15 @@ public class WarcRecorder implements AutoCloseable {
// Affix a version string in case we need to change the format in the future
// in some way
private final String warcRecorderVersion = "1.0";
private final CookieStore cookies;
private final LinkParser linkParser = new LinkParser();
/**
* Create a new WarcRecorder that will write to the given file
*
* @param warcFile The file to write to
*/
public WarcRecorder(Path warcFile, HttpFetcherImpl fetcher) throws IOException {
public WarcRecorder(Path warcFile) throws IOException {
this.warcFile = warcFile;
this.writer = new WarcWriter(warcFile);
this.cookies = fetcher.getCookies();
}
public WarcRecorder(Path warcFile, CookieStore cookies) throws IOException {
this.warcFile = warcFile;
this.writer = new WarcWriter(warcFile);
this.cookies = cookies;
}
/**
@@ -79,24 +70,21 @@ public class WarcRecorder implements AutoCloseable {
public WarcRecorder() throws IOException {
this.warcFile = Files.createTempFile("warc", ".warc.gz");
this.writer = new WarcWriter(this.warcFile);
this.cookies = new BasicCookieStore();
temporaryFile = true;
}
private boolean hasCookies() {
return !cookies.getCookies().isEmpty();
}
public HttpFetchResult fetch(HttpClient client,
ClassicHttpRequest request)
DomainCookies cookies,
HttpGet request)
throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
{
return fetch(client, request, Duration.ofMillis(MAX_TIME));
return fetch(client, cookies, request, Duration.ofMillis(MAX_TIME));
}
public HttpFetchResult fetch(HttpClient client,
ClassicHttpRequest request,
DomainCookies cookies,
HttpGet request,
Duration timeout)
throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
{
@@ -113,13 +101,15 @@ public class WarcRecorder implements AutoCloseable {
// Inject a range header to attempt to limit the size of the response
// to the maximum size we want to store, if the server supports it.
request.addHeader("Range", "bytes=0-"+MAX_SIZE);
cookies.paintRequest(request);
try {
return client.execute(request, response -> {
return client.execute(request,response -> {
try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, timeout);
try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request, timeout);
InputStream inputStream = inputBuffer.read()) {
cookies.updateCookieStore(response);
// Build and write the request
WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder();
@@ -143,8 +133,9 @@ public class WarcRecorder implements AutoCloseable {
warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
writer.write(warcRequest);
if (hasCookies()) {
extraHeaders.put("X-Has-Cookies", List.of("1"));
if (cookies.hasCookies()) {
response.addHeader("X-Has-Cookies", 1);
}
byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
@@ -259,7 +250,7 @@ public class WarcRecorder implements AutoCloseable {
writer.write(item);
}
private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags contentTags) {
private void saveOldResponse(EdgeUrl url, DomainCookies domainCookies, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags contentTags) {
try {
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
@@ -320,7 +311,7 @@ public class WarcRecorder implements AutoCloseable {
.date(Instant.now())
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
if (hasCookies()) {
if (domainCookies.hasCookies() || (headers != null && headers.contains("Set-Cookie:"))) {
builder.addHeader("X-Has-Cookies", "1");
}
@@ -340,8 +331,8 @@ public class WarcRecorder implements AutoCloseable {
* an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this
* scenario we want to record the data as it was in the previous crawl, but not re-fetch it.
*/
public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags ctags) {
saveOldResponse(url, contentType, statusCode, documentBody, headers, ctags);
public void writeReferenceCopy(EdgeUrl url, DomainCookies cookies, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags ctags) {
saveOldResponse(url, cookies, contentType, statusCode, documentBody, headers, ctags);
}
public void writeWarcinfoHeader(String ip, EdgeDomain domain, HttpFetcherImpl.DomainProbeResult result) throws IOException {
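
The net effect on the recorder's API shows up in the test updates below: a WarcRecorder now needs only an output path, and the cookie jar travels with each fetch call instead of living inside the recorder. A minimal sketch, assuming httpClient is an existing Apache HttpClient instance:

try (WarcRecorder recorder = new WarcRecorder(Path.of("crawl.warc"))) {
    HttpGet request = new HttpGet("https://www.example.com/");
    request.addHeader("User-Agent", "test.marginalia.nu");
    recorder.fetch(httpClient, new DomainCookies(), request);
}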

View File

@@ -6,6 +6,7 @@ import nu.marginalia.contenttype.ContentType;
import nu.marginalia.crawl.CrawlerMain;
import nu.marginalia.crawl.DomainStateDb;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.DomainCookies;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.logic.LinkFilterSelector;
@@ -51,9 +52,10 @@ public class CrawlerRetreiver implements AutoCloseable {
private final DomainStateDb domainStateDb;
private final WarcRecorder warcRecorder;
private final CrawlerRevisitor crawlerRevisitor;
private final DomainCookies cookies = new DomainCookies();
private static final CrawlerConnectionThrottle connectionThrottle = new CrawlerConnectionThrottle(
Duration.ofMillis(50) // pace the connections to avoid network congestion at startup
Duration.ofSeconds(1) // pace the connections to avoid network congestion at startup
);
int errorCount = 0;
@@ -124,7 +126,7 @@ public class CrawlerRetreiver implements AutoCloseable {
}
Instant recrawlStart = Instant.now();
CrawlerRevisitor.RecrawlMetadata recrawlMetadata = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
CrawlerRevisitor.RecrawlMetadata recrawlMetadata = crawlerRevisitor.recrawl(oldCrawlData, cookies, robotsRules, delayTimer);
Duration recrawlTime = Duration.between(recrawlStart, Instant.now());
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
@@ -274,7 +276,7 @@ public class CrawlerRetreiver implements AutoCloseable {
try {
var url = rootUrl.withPathAndParam("/", null);
HttpFetchResult result = fetcher.fetchContent(url, warcRecorder, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
HttpFetchResult result = fetcher.fetchContent(url, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
timer.waitFetchDelay(0);
if (result instanceof HttpFetchResult.ResultRedirect(EdgeUrl location)) {
@@ -337,7 +339,7 @@ public class CrawlerRetreiver implements AutoCloseable {
// Grab the favicon if it exists
if (fetcher.fetchContent(faviconUrl, warcRecorder, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED) instanceof HttpFetchResult.ResultOk iconResult) {
if (fetcher.fetchContent(faviconUrl, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED) instanceof HttpFetchResult.ResultOk iconResult) {
String contentType = iconResult.header("Content-Type");
byte[] iconData = iconResult.getBodyBytes();
@@ -407,7 +409,7 @@ public class CrawlerRetreiver implements AutoCloseable {
if (parsedOpt.isEmpty())
return false;
HttpFetchResult result = fetcher.fetchContent(parsedOpt.get(), warcRecorder, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
HttpFetchResult result = fetcher.fetchContent(parsedOpt.get(), warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
timer.waitFetchDelay(0);
if (!(result instanceof HttpFetchResult.ResultOk ok)) {
@@ -435,7 +437,7 @@ public class CrawlerRetreiver implements AutoCloseable {
{
var contentTags = reference.getContentTags();
HttpFetchResult fetchedDoc = fetcher.fetchContent(top, warcRecorder, timer, contentTags, HttpFetcher.ProbeType.FULL);
HttpFetchResult fetchedDoc = fetcher.fetchContent(top, warcRecorder, cookies, timer, contentTags, HttpFetcher.ProbeType.FULL);
timer.waitFetchDelay();
if (Thread.interrupted()) {
@@ -461,7 +463,7 @@ public class CrawlerRetreiver implements AutoCloseable {
{
var doc = reference.doc();
warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBodyBytes, doc.headers, contentTags);
warcRecorder.writeReferenceCopy(top, cookies, doc.contentType, doc.httpStatus, doc.documentBodyBytes, doc.headers, contentTags);
fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
new ContentType(doc.contentType, "UTF-8"),

View File

@@ -2,6 +2,7 @@ package nu.marginalia.crawl.retreival.revisit;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.DomainCookies;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
@@ -37,6 +38,7 @@ public class CrawlerRevisitor {
/** Performs a re-crawl of old documents, comparing etags and last-modified */
public RecrawlMetadata recrawl(CrawlDataReference oldCrawlData,
DomainCookies cookies,
SimpleRobotRules robotsRules,
CrawlDelayTimer delayTimer)
throws InterruptedException {
@@ -132,6 +134,7 @@ public class CrawlerRevisitor {
}
// Add a WARC record so we don't repeat this
warcRecorder.writeReferenceCopy(url,
cookies,
doc.contentType,
doc.httpStatus,
doc.documentBodyBytes,

View File

@@ -11,6 +11,8 @@ import org.junit.jupiter.api.*;
import java.io.IOException;
import java.net.URISyntaxException;
import static org.junit.jupiter.api.Assertions.assertEquals;
@Tag("slow")
class HttpFetcherImplContentTypeProbeTest {
@@ -85,55 +87,59 @@ class HttpFetcherImplContentTypeProbeTest {
@AfterEach
public void tearDown() throws IOException {
var stats = fetcher.getPoolStats();
assertEquals(0, stats.getLeased());
assertEquals(0, stats.getPending());
fetcher.close();
}
@Test
public void testProbeContentTypeHtmlShortcircuitPath() throws URISyntaxException {
var result = fetcher.probeContentType(new EdgeUrl("https://localhost/test.html"), new CrawlDelayTimer(50), ContentTags.empty());
Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Ok.class, result);
var result = fetcher.probeContentType(new EdgeUrl("https://localhost/test.html"), new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
}
@Test
public void testProbeContentTypeHtmlShortcircuitTags() {
var result = fetcher.probeContentType(contentTypeBinaryUrl, new CrawlDelayTimer(50), new ContentTags("a", "b"));
Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Ok.class, result);
var result = fetcher.probeContentType(contentTypeBinaryUrl, new DomainCookies(), new CrawlDelayTimer(50), new ContentTags("a", "b"));
Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
}
@Test
public void testProbeContentTypeHtml() {
var result = fetcher.probeContentType(contentTypeHtmlUrl, new CrawlDelayTimer(50), ContentTags.empty());
var result = fetcher.probeContentType(contentTypeHtmlUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Ok(contentTypeHtmlUrl), result);
}
@Test
public void testProbeContentTypeBinary() {
var result = fetcher.probeContentType(contentTypeBinaryUrl, new CrawlDelayTimer(50), ContentTags.empty());
var result = fetcher.probeContentType(contentTypeBinaryUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.BadContentType("application/octet-stream", 200), result);
}
@Test
public void testProbeContentTypeRedirect() {
var result = fetcher.probeContentType(redirectUrl, new CrawlDelayTimer(50), ContentTags.empty());
var result = fetcher.probeContentType(redirectUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Redirect(contentTypeHtmlUrl), result);
}
@Test
public void testProbeContentTypeBadHttpStatus() {
var result = fetcher.probeContentType(badHttpStatusUrl, new CrawlDelayTimer(50), ContentTags.empty());
var result = fetcher.probeContentType(badHttpStatusUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.HttpError(500, "Bad status code"), result);
}
@Test
public void testOnlyGetAllowed() {
var result = fetcher.probeContentType(onlyGetAllowedUrl, new CrawlDelayTimer(50), ContentTags.empty());
var result = fetcher.probeContentType(onlyGetAllowedUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Ok(onlyGetAllowedUrl), result);
}
@Test
public void testTimeout() {
var result = fetcher.probeContentType(timeoutUrl, new CrawlDelayTimer(50), ContentTags.empty());
var result = fetcher.probeContentType(timeoutUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Timeout.class, result);
}

View File

@@ -12,6 +12,8 @@ import org.junit.jupiter.api.*;
import java.io.IOException;
import java.net.URISyntaxException;
import static org.junit.jupiter.api.Assertions.assertEquals;
@Tag("slow")
class HttpFetcherImplDomainProbeTest {
@@ -47,6 +49,10 @@ class HttpFetcherImplDomainProbeTest {
@AfterEach
public void tearDown() throws IOException {
var stats = fetcher.getPoolStats();
assertEquals(0, stats.getLeased());
assertEquals(0, stats.getPending());
fetcher.close();
}

View File

@@ -31,6 +31,7 @@ class HttpFetcherImplFetchTest {
private static String lastModified = "Wed, 21 Oct 2024 07:28:00 GMT";
private static EdgeUrl okUrl;
private static EdgeUrl okUrlSetsCookie;
private static EdgeUrl okRangeResponseUrl;
private static EdgeUrl okUrlWith304;
@@ -88,6 +89,19 @@ class HttpFetcherImplFetchTest {
.withStatus(200)
.withBody("Hello World")));
okUrlSetsCookie = new EdgeUrl("http://localhost:18089/okSetCookie.bin");
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(okUrlSetsCookie.path))
.willReturn(WireMock.aResponse()
.withHeader("Content-Type", "text/html")
.withHeader("Set-Cookie", "test=1")
.withStatus(200)));
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(okUrlSetsCookie.path))
.willReturn(WireMock.aResponse()
.withHeader("Content-Type", "text/html")
.withHeader("Set-Cookie", "test=1")
.withStatus(200)
.withBody("Hello World")));
okUrlWith304 = new EdgeUrl("http://localhost:18089/ok304.bin");
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(okUrlWith304.path))
.willReturn(WireMock.aResponse()
@@ -117,6 +131,8 @@ class HttpFetcherImplFetchTest {
.withHeader("Keep-Alive", "max=4, timeout=30")
.withBody("Hello")
));
wireMockServer.start();
}
@@ -134,20 +150,31 @@ class HttpFetcherImplFetchTest {
public void setUp() throws IOException {
fetcher = new HttpFetcherImpl(new UserAgent("test.marginalia.nu", "test.marginalia.nu"));
warcFile = Files.createTempFile(getClass().getSimpleName(), ".warc");
warcRecorder = new WarcRecorder(warcFile, fetcher);
warcRecorder = new WarcRecorder(warcFile);
}
@AfterEach
public void tearDown() throws IOException {
var stats = fetcher.getPoolStats();
assertEquals(0, stats.getLeased());
assertEquals(0, stats.getPending());
System.out.println(stats);
fetcher.close();
warcRecorder.close();
Files.deleteIfExists(warcFile);
}
@Test
public void testFoo() {
fetcher.fetchSitemapUrls("https://www.marginalia.nu/sitemap.xml", new CrawlDelayTimer(100));
}
@Test
public void testOk_NoProbe() throws IOException {
var result = fetcher.fetchContent(okUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
var result = fetcher.fetchContent(okUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
Assertions.assertTrue(result.isOk());
@@ -158,12 +185,29 @@ class HttpFetcherImplFetchTest {
Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));
WarcResponse response = (WarcResponse) warcRecords.get(1);
assertEquals("0", response.headers().first("X-Has-Cookies").orElse("0"));
assertEquals("0", response.http().headers().first("X-Has-Cookies").orElse("0"));
}
@Test
public void testOkSetsCookie() throws IOException {
var cookies = new DomainCookies();
var result = fetcher.fetchContent(okUrlSetsCookie, warcRecorder, cookies, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
Assertions.assertTrue(result.isOk());
List<WarcRecord> warcRecords = getWarcRecords();
assertEquals(2, warcRecords.size());
Assertions.assertInstanceOf(WarcRequest.class, warcRecords.get(0));
Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));
WarcResponse response = (WarcResponse) warcRecords.get(1);
assertEquals("1", response.http().headers().first("X-Has-Cookies").orElse("0"));
}
@Test
public void testOk_FullProbe() {
var result = fetcher.fetchContent(okUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
var result = fetcher.fetchContent(okUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
Assertions.assertTrue(result.isOk());
@@ -171,7 +215,7 @@ class HttpFetcherImplFetchTest {
@Test
public void testOk304_NoProbe() {
var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.DISABLED);
var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.DISABLED);
Assertions.assertInstanceOf(HttpFetchResult.Result304Raw.class, result);
System.out.println(result);
@@ -180,7 +224,7 @@ class HttpFetcherImplFetchTest {
@Test
public void testOk304_FullProbe() {
var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.FULL);
var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.FULL);
Assertions.assertInstanceOf(HttpFetchResult.Result304Raw.class, result);
System.out.println(result);
@@ -188,7 +232,7 @@ class HttpFetcherImplFetchTest {
@Test
public void testBadStatus_NoProbe() throws IOException {
var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
Assertions.assertFalse(result.isOk());
@@ -202,7 +246,7 @@ class HttpFetcherImplFetchTest {
@Test
public void testBadStatus_FullProbe() {
var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
Assertions.assertFalse(result.isOk());
@@ -212,7 +256,7 @@ class HttpFetcherImplFetchTest {
@Test
public void testRedirect_NoProbe() throws URISyntaxException, IOException {
var result = fetcher.fetchContent(redirectUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
var result = fetcher.fetchContent(redirectUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
Assertions.assertInstanceOf(HttpFetchResult.ResultRedirect.class, result);
assertEquals(new EdgeUrl("http://localhost:18089/test.html.bin"), ((HttpFetchResult.ResultRedirect) result).url());
@@ -225,7 +269,7 @@ class HttpFetcherImplFetchTest {
@Test
public void testRedirect_FullProbe() throws URISyntaxException {
var result = fetcher.fetchContent(redirectUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
var result = fetcher.fetchContent(redirectUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
Assertions.assertInstanceOf(HttpFetchResult.ResultRedirect.class, result);
assertEquals(new EdgeUrl("http://localhost:18089/test.html.bin"), ((HttpFetchResult.ResultRedirect) result).url());
@@ -238,7 +282,7 @@ class HttpFetcherImplFetchTest {
public void testFetchTimeout_NoProbe() throws IOException, URISyntaxException {
Instant requestStart = Instant.now();
var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
Assertions.assertInstanceOf(HttpFetchResult.ResultException.class, result);
@@ -262,7 +306,7 @@ class HttpFetcherImplFetchTest {
@Test
public void testRangeResponse() throws IOException {
var result = fetcher.fetchContent(okRangeResponseUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
var result = fetcher.fetchContent(okRangeResponseUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
Assertions.assertTrue(result.isOk());
@@ -279,7 +323,7 @@ class HttpFetcherImplFetchTest {
@Test
public void testFetchTimeout_Probe() throws IOException, URISyntaxException {
Instant requestStart = Instant.now();
var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
Instant requestEnd = Instant.now();
Assertions.assertInstanceOf(HttpFetchResult.ResultException.class, result);
@@ -302,7 +346,7 @@ class HttpFetcherImplFetchTest {
@Test
public void testKeepaliveUrl() {
// mostly for smoke testing and debugger utility
var result = fetcher.fetchContent(keepAliveUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
var result = fetcher.fetchContent(keepAliveUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
Assertions.assertTrue(result.isOk());
@@ -319,6 +363,13 @@ class HttpFetcherImplFetchTest {
WarcXEntityRefused.register(reader);
for (var record : reader) {
// Load the body, we need to do this before we close the reader to have access to the content.
if (record instanceof WarcRequest req) {
req.http();
} else if (record instanceof WarcResponse rsp) {
rsp.http();
}
records.add(record);
}
}

View File

@@ -1,12 +1,12 @@
package nu.marginalia.crawl.retreival;
import nu.marginalia.crawl.fetcher.DomainCookies;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.cookie.BasicCookieStore;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@@ -45,7 +45,7 @@ class CrawlerWarcResynchronizerTest {
@Test
void run() throws IOException, URISyntaxException {
try (var oldRecorder = new WarcRecorder(fileName, new BasicCookieStore())) {
try (var oldRecorder = new WarcRecorder(fileName)) {
fetchUrl(oldRecorder, "https://www.marginalia.nu/");
fetchUrl(oldRecorder, "https://www.marginalia.nu/log/");
fetchUrl(oldRecorder, "https://www.marginalia.nu/feed/");
@@ -55,7 +55,7 @@ class CrawlerWarcResynchronizerTest {
var crawlFrontier = new DomainCrawlFrontier(new EdgeDomain("www.marginalia.nu"), List.of(), 100);
try (var newRecorder = new WarcRecorder(outputFile, new BasicCookieStore())) {
try (var newRecorder = new WarcRecorder(outputFile)) {
new CrawlerWarcResynchronizer(crawlFrontier, newRecorder).run(fileName);
}
@@ -78,10 +78,10 @@ class CrawlerWarcResynchronizerTest {
}
void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
var req = ClassicRequestBuilder.get(new java.net.URI(url))
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.build();
recorder.fetch(httpClient, req);
HttpGet request = new HttpGet(url);
request.addHeader("User-agent", "test.marginalia.nu");
request.addHeader("Accept-Encoding", "gzip");
recorder.fetch(httpClient, new DomainCookies(), request);
}
}

View File

@@ -2,6 +2,7 @@ package nu.marginalia.crawl.retreival.fetcher;
import com.sun.net.httpserver.HttpServer;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.DomainCookies;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
@@ -88,7 +89,7 @@ class ContentTypeProberTest {
@Test
void probeContentTypeOk() throws Exception {
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlEndpoint, new CrawlDelayTimer(50), ContentTags.empty());
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
System.out.println(result);
@@ -97,7 +98,7 @@ class ContentTypeProberTest {
@Test
void probeContentTypeRedir() throws Exception {
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlRedirEndpoint, new CrawlDelayTimer(50), ContentTags.empty());
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlRedirEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
System.out.println(result);
@@ -106,7 +107,7 @@ class ContentTypeProberTest {
@Test
void probeContentTypeBad() throws Exception {
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(binaryEndpoint, new CrawlDelayTimer(50), ContentTags.empty());
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(binaryEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
System.out.println(result);
@@ -115,7 +116,7 @@ class ContentTypeProberTest {
@Test
void probeContentTypeTimeout() throws Exception {
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(timeoutEndpoint, new CrawlDelayTimer(50), ContentTags.empty());
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(timeoutEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
System.out.println(result);

View File

@@ -1,11 +1,11 @@
package nu.marginalia.crawl.retreival.fetcher;
import com.sun.net.httpserver.HttpServer;
import nu.marginalia.crawl.fetcher.DomainCookies;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.cookie.BasicCookieStore;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
import org.junit.jupiter.api.*;
import org.netpreserve.jwarc.WarcReader;
import org.netpreserve.jwarc.WarcRequest;
@@ -51,14 +51,14 @@ class WarcRecorderFakeServerTest {
os.write("<html><body>hello</body></html>".getBytes());
os.flush();
try {
TimeUnit.SECONDS.sleep(1);
TimeUnit.SECONDS.sleep(2);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
os.write(":".getBytes());
os.flush();
try {
TimeUnit.SECONDS.sleep(1);
TimeUnit.SECONDS.sleep(2);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
@@ -89,24 +89,22 @@ class WarcRecorderFakeServerTest {
fileNameWarc = Files.createTempFile("test", ".warc");
fileNameParquet = Files.createTempFile("test", ".parquet");
client = new WarcRecorder(fileNameWarc, new BasicCookieStore());
client = new WarcRecorder(fileNameWarc);
}
@AfterEach
public void tearDown() throws Exception {
client.close();
Files.delete(fileNameWarc);
}
@Test
public void fetchFast() throws Exception {
client.fetch(httpClient,
ClassicRequestBuilder
.get(new java.net.URI("http://localhost:14510/fast"))
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.build()
);
HttpGet request = new HttpGet("http://localhost:14510/fast");
request.addHeader("User-agent", "test.marginalia.nu");
request.addHeader("Accept-Encoding", "gzip");
client.fetch(httpClient, new DomainCookies(), request);
Map<String, String> sampleData = new HashMap<>();
try (var warcReader = new WarcReader(fileNameWarc)) {
@@ -127,11 +125,13 @@ class WarcRecorderFakeServerTest {
public void fetchSlow() throws Exception {
Instant start = Instant.now();
HttpGet request = new HttpGet("http://localhost:14510/slow");
request.addHeader("User-agent", "test.marginalia.nu");
request.addHeader("Accept-Encoding", "gzip");
client.fetch(httpClient,
ClassicRequestBuilder.get(new java.net.URI("http://localhost:14510/slow"))
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.build(),
new DomainCookies(),
request,
Duration.ofSeconds(1)
);
Instant end = Instant.now();
@@ -149,6 +149,8 @@ class WarcRecorderFakeServerTest {
});
}
System.out.println(
Files.readString(fileNameWarc));
System.out.println(sampleData);
// Timeout is set to 1 second, but the server will take 5 seconds to respond,

View File

@@ -2,14 +2,14 @@ package nu.marginalia.crawl.retreival.fetcher;
import nu.marginalia.UserAgent;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.DomainCookies;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.cookie.BasicCookieStore;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@@ -41,7 +41,7 @@ class WarcRecorderTest {
fileNameWarc = Files.createTempFile("test", ".warc");
fileNameParquet = Files.createTempFile("test", ".parquet");
client = new WarcRecorder(fileNameWarc, new BasicCookieStore());
client = new WarcRecorder(fileNameWarc);
}
@AfterEach
@@ -52,12 +52,12 @@ class WarcRecorderTest {
@Test
void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
client.fetch(httpClient,
ClassicRequestBuilder.get(new java.net.URI("https://www.marginalia.nu/"))
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.build()
);
HttpGet request = new HttpGet("https://www.marginalia.nu/");
request.addHeader("User-agent", "test.marginalia.nu");
request.addHeader("Accept-Encoding", "gzip");
client.fetch(httpClient, new DomainCookies(), request);
Map<String, String> sampleData = new HashMap<>();
try (var warcReader = new WarcReader(fileNameWarc)) {
@@ -78,8 +78,9 @@ class WarcRecorderTest {
@Test
public void flagAsSkipped() throws IOException, URISyntaxException {
try (var recorder = new WarcRecorder(fileNameWarc, new BasicCookieStore())) {
try (var recorder = new WarcRecorder(fileNameWarc)) {
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
new DomainCookies(),
"text/html",
200,
"<?doctype html><html><body>test</body></html>".getBytes(),
@@ -102,8 +103,9 @@ class WarcRecorderTest {
@Test
public void flagAsSkippedNullBody() throws IOException, URISyntaxException {
try (var recorder = new WarcRecorder(fileNameWarc, new BasicCookieStore())) {
try (var recorder = new WarcRecorder(fileNameWarc)) {
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
new DomainCookies(),
"text/html",
200,
null,
@@ -114,8 +116,9 @@ class WarcRecorderTest {
@Test
public void testSaveImport() throws URISyntaxException, IOException {
try (var recorder = new WarcRecorder(fileNameWarc, new BasicCookieStore())) {
try (var recorder = new WarcRecorder(fileNameWarc)) {
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
new DomainCookies(),
"text/html",
200,
"<?doctype html><html><body>test</body></html>".getBytes(),
@@ -138,23 +141,23 @@ class WarcRecorderTest {
@Test
public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
client.fetch(httpClient, ClassicRequestBuilder
.get(new java.net.URI("https://www.marginalia.nu/"))
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.build());
HttpGet request1 = new HttpGet("https://www.marginalia.nu/");
request1.addHeader("User-agent", "test.marginalia.nu");
request1.addHeader("Accept-Encoding", "gzip");
client.fetch(httpClient, ClassicRequestBuilder
.get(new java.net.URI("https://www.marginalia.nu/log/"))
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.build());
client.fetch(httpClient, new DomainCookies(), request1);
client.fetch(httpClient, ClassicRequestBuilder
.get(new java.net.URI("https://www.marginalia.nu/sanic.png"))
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.build());
HttpGet request2 = new HttpGet("https://www.marginalia.nu/log/");
request2.addHeader("User-agent", "test.marginalia.nu");
request2.addHeader("Accept-Encoding", "gzip");
client.fetch(httpClient, new DomainCookies(), request2);
HttpGet request3 = new HttpGet("https://www.marginalia.nu/sanic.png");
request3.addHeader("User-agent", "test.marginalia.nu");
request3.addHeader("Accept-Encoding", "gzip");
client.fetch(httpClient, new DomainCookies(), request3);
CrawledDocumentParquetRecordFileWriter.convertWarc(
"www.marginalia.nu",

View File

@@ -1,6 +1,7 @@
package nu.marginalia.crawling;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.DomainCookies;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@@ -31,7 +32,7 @@ class HttpFetcherTest {
void fetchUTF8() throws Exception {
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
try (var recorder = new WarcRecorder()) {
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, new DomainCookies(), new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
System.out.println(bodyOk.contentType());
}
@@ -49,7 +50,7 @@ class HttpFetcherTest {
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
try (var recorder = new WarcRecorder()) {
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, new DomainCookies(), new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
System.out.println(bodyOk.contentType());
}

View File

@@ -3,10 +3,7 @@ package nu.marginalia.crawling.retreival;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.CrawlerMain;
import nu.marginalia.crawl.DomainStateDb;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.SitemapRetriever;
import nu.marginalia.crawl.fetcher.*;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
@@ -137,7 +134,7 @@ public class CrawlerMockFetcherTest {
}
@Override
public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, CrawlDelayTimer timer, ContentTags tags, ProbeType probeType) {
public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, DomainCookies cookies, CrawlDelayTimer timer, ContentTags tags, ProbeType probeType) {
logger.info("Fetching {}", url);
if (mockData.containsKey(url)) {
byte[] bodyBytes = mockData.get(url).documentBodyBytes;

View File

@@ -16,7 +16,6 @@ import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.crawldata.CrawledDomain;
import nu.marginalia.model.crawldata.SerializableCrawlData;
import nu.marginalia.slop.SlopCrawlDataRecord;
import org.apache.hc.client5.http.cookie.BasicCookieStore;
import org.jetbrains.annotations.NotNull;
import org.junit.jupiter.api.*;
import org.netpreserve.jwarc.*;
@@ -180,7 +179,7 @@ class CrawlerRetreiverTest {
new EdgeDomain("www.marginalia.nu"),
List.of(), 100);
var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
new WarcRecorder(tempFileWarc2, new BasicCookieStore())
new WarcRecorder(tempFileWarc2)
);
// truncate the size of the file to simulate a crash
@@ -456,7 +455,7 @@ class CrawlerRetreiverTest {
List.of(), 100);
var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
new WarcRecorder(tempFileWarc3, new BasicCookieStore())
new WarcRecorder(tempFileWarc3)
);
// truncate the size of the file to simulate a crash
@@ -507,7 +506,7 @@ class CrawlerRetreiverTest {
}
private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, CrawlDataReference reference) {
try (var recorder = new WarcRecorder(tempFileWarc2, new BasicCookieStore());
try (var recorder = new WarcRecorder(tempFileWarc2);
var db = new DomainStateDb(tempFileDb)
) {
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(), reference);
@@ -519,7 +518,7 @@ class CrawlerRetreiverTest {
@NotNull
private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlerMain.CrawlSpecRecord specs) {
try (var recorder = new WarcRecorder(tempFileWarc1, new BasicCookieStore());
try (var recorder = new WarcRecorder(tempFileWarc1);
var db = new DomainStateDb(tempFileDb)
) {
var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder);

View File

@@ -10,6 +10,7 @@ import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.converting.writer.ConverterBatchWriter;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.DomainCookies;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.functions.searchquery.QueryFactory;
@@ -43,7 +44,6 @@ import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.test.IntegrationTestModule;
import nu.marginalia.test.TestUtil;
import org.apache.hc.client5.http.cookie.BasicCookieStore;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@@ -121,11 +121,12 @@ public class IntegrationTest {
public void run() throws Exception {
/** CREATE WARC */
try (WarcRecorder warcRecorder = new WarcRecorder(warcData, new BasicCookieStore())) {
try (WarcRecorder warcRecorder = new WarcRecorder(warcData)) {
warcRecorder.writeWarcinfoHeader("127.0.0.1", new EdgeDomain("www.example.com"),
new HttpFetcherImpl.DomainProbeResult.Ok(new EdgeUrl("https://www.example.com/")));
warcRecorder.writeReferenceCopy(new EdgeUrl("https://www.example.com/"),
new DomainCookies(),
"text/html", 200,
"""
<html>