mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits


4 Commits

Author SHA1 Message Date
Viktor Lofgren
1a2aae496a (crawler) Correct handling and abortion of HttpClient's requests
There was a resource leak in the initial Apache HttpClient implementation of WarcInputBuffer: it did not reliably release the response stream and its underlying connection.

Switched to HttpGet objects instead of the Classic...Request objects, as the latter do not expose an abort() method (see the sketch below the commit list).
2025-04-18 00:16:26 +02:00
Viktor Lofgren
353cdffb3f (crawler) Increase connection request timeout, restore congestion timeout 2025-04-17 21:32:06 +02:00
Viktor Lofgren
2e3f1313c7 (crawler) Log exceptions while crawling in crawler audit log 2025-04-17 21:18:09 +02:00
Viktor Lofgren
58e6f141ce (crawler) Reduce congestion throttle go-rate 2025-04-17 20:36:58 +02:00
9 changed files with 106 additions and 88 deletions
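For context on the first commit: in Apache HttpClient 5, abortability comes from HttpUriRequestBase, which HttpGet extends; a request assembled with ClassicRequestBuilder.get(...).build() has no abort() method, so a hung or slow transfer cannot be cancelled mid-flight. A minimal sketch of the pattern, assuming HttpClient 5.x (the watchdog executor and the five-second cutoff are illustrative, not taken from the crawler):

import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.HttpClients;

import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

class AbortSketch {
    public static void main(String[] args) throws Exception {
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            HttpGet request = new HttpGet("https://www.marginalia.nu/");

            // Schedule an abort; HttpGet inherits abort() from HttpUriRequestBase.
            var watchdog = Executors.newSingleThreadScheduledExecutor();
            watchdog.schedule(request::abort, 5, TimeUnit.SECONDS);

            try {
                // If abort() fires first, execute() fails instead of hanging forever.
                int code = client.execute(request, rsp -> rsp.getCode());
                System.out.println("HTTP " + code);
            } finally {
                watchdog.shutdownNow();
            }
        }
    }
}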

View File: ContentTags.java

@@ -1,6 +1,6 @@
 package nu.marginalia.crawl.fetcher;

-import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
+import org.apache.hc.client5.http.classic.methods.HttpGet;

 /** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
 public record ContentTags(String etag, String lastMod) {
@@ -17,14 +17,14 @@ public record ContentTags(String etag, String lastMod) {
     }

     /** Paints the tags onto the request builder. */
-    public void paint(ClassicRequestBuilder getBuilder) {
+    public void paint(HttpGet request) {

         if (etag != null) {
-            getBuilder.addHeader("If-None-Match", etag);
+            request.addHeader("If-None-Match", etag);
         }

         if (lastMod != null) {
-            getBuilder.addHeader("If-Modified-Since", lastMod);
+            request.addHeader("If-Modified-Since", lastMod);
         }
     }
 }
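Usage note: paint() implements a conditional GET. When an ETag or Last-Modified value survives from an earlier crawl of the same document, these headers let the server answer 304 Not Modified instead of resending the body, which the fetch code below surfaces as the Result304* variants. A short sketch of the intended round-trip (the tag values here are made up for illustration):

HttpGet request = new HttpGet("https://www.marginalia.nu/log/");

// Values retained from a previous fetch of this document (illustrative):
ContentTags tags = new ContentTags("\"abc123\"", "Wed, 16 Apr 2025 20:00:00 GMT");
tags.paint(request); // adds If-None-Match and If-Modified-Since headers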

View File: HttpFetcherImpl.java

@@ -17,6 +17,7 @@ import nu.marginalia.model.crawldata.CrawlerDomainStatus;
 import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
 import org.apache.hc.client5.http.HttpRequestRetryStrategy;
 import org.apache.hc.client5.http.classic.HttpClient;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.config.ConnectionConfig;
 import org.apache.hc.client5.http.config.RequestConfig;
 import org.apache.hc.client5.http.cookie.BasicCookieStore;
@@ -99,7 +100,7 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
         final RequestConfig defaultRequestConfig = RequestConfig.custom()
                 .setCookieSpec(StandardCookieSpec.RELAXED)
                 .setResponseTimeout(10, TimeUnit.SECONDS)
-                .setConnectionRequestTimeout(8, TimeUnit.SECONDS)
+                .setConnectionRequestTimeout(5, TimeUnit.MINUTES)
                 .build();

         return HttpClients.custom()
@@ -398,16 +399,16 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
         }

-        ClassicRequestBuilder getBuilder = ClassicRequestBuilder.get(url.asURI())
-                .addHeader("User-Agent", userAgentString)
-                .addHeader("Accept-Encoding", "gzip")
-                .addHeader("Accept-Language", "en,*;q=0.5")
-                .addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8");
+        HttpGet request = new HttpGet(url.asURI());
+        request.addHeader("User-Agent", userAgentString);
+        request.addHeader("Accept-Encoding", "gzip");
+        request.addHeader("Accept-Language", "en,*;q=0.5");
+        request.addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8");

-        contentTags.paint(getBuilder);
+        contentTags.paint(request);

         try (var sl = new SendLock()) {
-            HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());
+            HttpFetchResult result = warcRecorder.fetch(client, request);

             if (result instanceof HttpFetchResult.ResultOk ok) {
                 if (ok.statusCode() == 304) {
@@ -419,7 +420,7 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
             case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {}", ok.statusCode(), url);
             case HttpFetchResult.ResultRedirect redirect -> logger.info(crawlerAuditMarker, "Fetch result redirect: {} for {}", redirect.url(), url);
             case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none for {}", url);
-            case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception: {} for {}", ex.getClass().getSimpleName(), url);
+            case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception for " + url + ": {}", ex.ex());
             case HttpFetchResult.Result304Raw raw -> logger.info(crawlerAuditMarker, "Fetch result: 304 Raw for {}", url);
             case HttpFetchResult.Result304ReplacedWithReference ref -> logger.info(crawlerAuditMarker, "Fetch result: 304 With reference for {}", url);
         }
@@ -494,13 +495,13 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
     }

-    private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
-        ClassicHttpRequest getRequest = ClassicRequestBuilder.get(sitemapUrl.asURI())
-                .addHeader("User-Agent", userAgentString)
-                .addHeader("Accept-Encoding", "gzip")
-                .addHeader("Accept", "text/*, */*;q=0.9")
-                .addHeader("User-Agent", userAgentString)
-                .build();
+    private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException {
+        HttpGet getRequest = new HttpGet(sitemapUrl.asURI());
+
+        getRequest.addHeader("User-Agent", userAgentString);
+        getRequest.addHeader("Accept-Encoding", "gzip");
+        getRequest.addHeader("Accept", "text/*, */*;q=0.9");
+        getRequest.addHeader("User-Agent", userAgentString);

         try (var sl = new SendLock()) {
             return client.execute(getRequest, response -> {
@@ -574,11 +575,10 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
     private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
         try (var sl = new SendLock()) {
-            ClassicHttpRequest request = ClassicRequestBuilder.get(url.asURI())
-                    .addHeader("User-Agent", userAgentString)
-                    .addHeader("Accept-Encoding", "gzip")
-                    .addHeader("Accept", "text/*, */*;q=0.9")
-                    .build();
+            HttpGet request = new HttpGet(url.asURI());
+            request.addHeader("User-Agent", userAgentString);
+            request.addHeader("Accept-Encoding", "gzip");
+            request.addHeader("Accept", "text/*, */*;q=0.9");

             HttpFetchResult result = recorder.fetch(client, request);
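A note on the timeout change in this file: setConnectionRequestTimeout bounds how long a caller may wait to lease a connection from the client's pool, not how long the network operation itself may take (that is the response timeout). With connection establishment deliberately paced by the crawler's throttle (see CrawlerRetreiver below), the old 8-second lease timeout could presumably trip while requests queued, which is what commit 353cdffb3f addresses. The two settings side by side, values taken from the diff above:

RequestConfig cfg = RequestConfig.custom()
        .setResponseTimeout(10, TimeUnit.SECONDS)         // max wait for response data
        .setConnectionRequestTimeout(5, TimeUnit.MINUTES) // max wait to lease a pooled connection
        .build();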

View File: WarcInputBuffer.java

@@ -2,6 +2,7 @@ package nu.marginalia.crawl.fetcher.warc;

 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.BOMInputStream;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.core5.http.ClassicHttpResponse;
 import org.apache.hc.core5.http.Header;
 import org.netpreserve.jwarc.WarcTruncationReason;
@@ -43,7 +44,9 @@ public abstract class WarcInputBuffer implements AutoCloseable {
      * and suppressed from the headers.
      * If an error occurs, a buffer will be created with no content and an error status.
      */
-    static WarcInputBuffer forResponse(ClassicHttpResponse response, Duration timeLimit) throws IOException {
+    static WarcInputBuffer forResponse(ClassicHttpResponse response,
+                                       HttpGet request,
+                                       Duration timeLimit) throws IOException {
         if (response == null)
             return new ErrorBuffer();
@@ -57,13 +60,25 @@ public abstract class WarcInputBuffer implements AutoCloseable {
         InputStream is = entity.getContent();
         long length = entity.getContentLength();

-        try (response) {
+        try {
             if (length > 0 && length < 8192) {
                 // If the content is small and not compressed, we can just read it into memory
-                return new MemoryBuffer(response.getHeaders(), timeLimit, is, (int) length);
+                return new MemoryBuffer(response.getHeaders(), request, timeLimit, is, (int) length);
             } else {
                 // Otherwise, we unpack it into a file and read it from there
-                return new FileBuffer(response.getHeaders(), timeLimit, is);
+                return new FileBuffer(response.getHeaders(), request, timeLimit, is);
             }
         }
+        finally {
+            try {
+                is.skip(Long.MAX_VALUE);
+            }
+            catch (IOException e) {
+                // Ignore the exception
+            }
+            finally {
+                // Close the input stream
+                IOUtils.closeQuietly(is);
+            }
+        }
     }
@@ -71,7 +86,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
     }

     /** Copy an input stream to an output stream, with a maximum size and time limit */
-    protected void copy(InputStream is, OutputStream os, Duration timeLimit) {
+    protected void copy(InputStream is, HttpGet request, OutputStream os, Duration timeLimit) {
         Instant start = Instant.now();
         Instant timeout = start.plus(timeLimit);
         long size = 0;
@@ -86,6 +101,10 @@ public abstract class WarcInputBuffer implements AutoCloseable {
             Duration remaining = Duration.between(Instant.now(), timeout);
             if (remaining.isNegative()) {
                 truncationReason = WarcTruncationReason.TIME;
+                // Abort the request if the time limit is exceeded
+                // so we don't keep the connection open forever or are forced to consume
+                // the stream to the end
+                request.abort();
                 break;
             }
@@ -104,6 +123,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
             }
             else if (truncationReason != WarcTruncationReason.LENGTH) {
                 truncationReason = WarcTruncationReason.LENGTH;
+                break;
             }
         } catch (IOException e) {
@@ -111,13 +131,6 @@ public abstract class WarcInputBuffer implements AutoCloseable {
             }
         }

-        // Try to close the connection as long as we haven't timed out.
-        // As per Apache HttpClient's semantics, this will reset the connection
-        // and close the stream if we have timed out.
-        if (truncationReason != WarcTruncationReason.TIME) {
-            IOUtils.closeQuietly(is);
-        }
     }
@@ -218,7 +231,7 @@ class ErrorBuffer extends WarcInputBuffer {

 /** Buffer for when we have the response in memory */
 class MemoryBuffer extends WarcInputBuffer {
     byte[] data;
-    public MemoryBuffer(Header[] headers, Duration timeLimit, InputStream responseStream, int size) {
+    public MemoryBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream, int size) {
         super(suppressContentEncoding(headers));

         if (!isRangeComplete(headers)) {
@@ -229,7 +242,7 @@ class MemoryBuffer extends WarcInputBuffer {
         var outputStream = new ByteArrayOutputStream(size);

-        copy(responseStream, outputStream, timeLimit);
+        copy(responseStream, request, outputStream, timeLimit);

         data = outputStream.toByteArray();
     }
@@ -253,7 +266,7 @@ class MemoryBuffer extends WarcInputBuffer {
 class FileBuffer extends WarcInputBuffer {
     private final Path tempFile;

-    public FileBuffer(Header[] headers, Duration timeLimit, InputStream responseStream) throws IOException {
+    public FileBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream) throws IOException {
         super(suppressContentEncoding(headers));

         if (!isRangeComplete(headers)) {
@@ -265,7 +278,7 @@ class FileBuffer extends WarcInputBuffer {
         this.tempFile = Files.createTempFile("rsp", ".html");
         try (var out = Files.newOutputStream(tempFile)) {
-            copy(responseStream, out, timeLimit);
+            copy(responseStream, request, out, timeLimit);
         }
         catch (Exception ex) {
             truncationReason = WarcTruncationReason.UNSPECIFIED;
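The new finally block in forResponse() is the standard keep-alive idiom for Apache HttpClient: drain whatever remains of the body before closing, so the connection can be returned to the pool, while request.abort() in copy() covers the timeout case where draining would take too long and the connection should be torn down instead. Restated as a standalone helper, a sketch under the same assumptions:

import org.apache.commons.io.IOUtils;

import java.io.IOException;
import java.io.InputStream;

class StreamRelease {
    /** Drain and close a response stream so its connection can be reused. */
    static void drainAndClose(InputStream is) {
        try {
            is.skip(Long.MAX_VALUE); // consume any unread body; enables keep-alive reuse
        }
        catch (IOException e) {
            // Ignore: worst case the connection is simply not reused
        }
        finally {
            IOUtils.closeQuietly(is); // always close, even if draining failed
        }
    }
}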

View File: WarcRecorder.java

@@ -8,9 +8,9 @@ import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
 import org.apache.hc.client5.http.classic.HttpClient;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.cookie.BasicCookieStore;
 import org.apache.hc.client5.http.cookie.CookieStore;
-import org.apache.hc.core5.http.ClassicHttpRequest;
 import org.apache.hc.core5.http.NameValuePair;
 import org.jetbrains.annotations.Nullable;
 import org.netpreserve.jwarc.*;
@@ -89,14 +89,14 @@ public class WarcRecorder implements AutoCloseable {
     }

     public HttpFetchResult fetch(HttpClient client,
-                                 ClassicHttpRequest request)
+                                 HttpGet request)
             throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
     {
         return fetch(client, request, Duration.ofMillis(MAX_TIME));
     }

     public HttpFetchResult fetch(HttpClient client,
-                                 ClassicHttpRequest request,
+                                 HttpGet request,
                                  Duration timeout)
             throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
     {
@@ -117,7 +117,7 @@ public class WarcRecorder implements AutoCloseable {
         try {
             return client.execute(request, response -> {
-                try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, timeout);
+                try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request, timeout);
                      InputStream inputStream = inputBuffer.read()) {

                     // Build and write the request
@@ -237,6 +237,7 @@ public class WarcRecorder implements AutoCloseable {
                         dataStart,
                         responseDataBuffer.length() - dataStart);
             } catch (Exception ex) {
+                ex.printStackTrace();
                 flagAsError(new EdgeUrl(requestUri), ex); // write a WARC record to indicate the error
                 logger.warn("Failed to fetch URL {}: {}", requestUri, ex.getMessage());
                 return new HttpFetchResult.ResultException(ex);
@@ -249,6 +250,7 @@ public class WarcRecorder implements AutoCloseable {
             flagAsTimeout(new EdgeUrl(requestUri)); // write a WARC record to indicate the timeout
             return new HttpFetchResult.ResultException(ex);
         } catch (IOException ex) {
+            ex.printStackTrace();
             flagAsError(new EdgeUrl(requestUri), ex); // write a WARC record to indicate the error
             logger.warn("Failed to fetch URL {}: {}", requestUri, ex.getMessage());
             return new HttpFetchResult.ResultException(ex);

View File: CrawlerRetreiver.java

@@ -53,7 +53,7 @@ public class CrawlerRetreiver implements AutoCloseable {
     private final CrawlerRevisitor crawlerRevisitor;

     private static final CrawlerConnectionThrottle connectionThrottle = new CrawlerConnectionThrottle(
-            Duration.ofSeconds(1) // pace the connections to avoid network congestion by waiting 1 second between establishing them
+            Duration.ofSeconds(1) // pace the connections to avoid network congestion at startup
     );

     int errorCount = 0;
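CrawlerConnectionThrottle itself is not part of this diff; only its pacing comment changed. For illustration, a hypothetical sketch of such a gate, where only the Duration parameter is taken from the code above: callers block until their one-per-interval slot comes up.

import java.time.Duration;

// Hypothetical pacer: serializes connection establishment at one per interval.
class ConnectionPacer {
    private final long intervalMs;
    private long nextSlotMs = 0;

    ConnectionPacer(Duration interval) {
        this.intervalMs = interval.toMillis();
    }

    // Holding the monitor while sleeping is deliberate here:
    // it releases waiting threads one interval apart.
    synchronized void awaitSlot() throws InterruptedException {
        long now = System.currentTimeMillis();
        long wait = nextSlotMs - now;
        if (wait > 0) {
            Thread.sleep(wait);
        }
        nextSlotMs = Math.max(now, nextSlotMs) + intervalMs;
    }
}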

View File: HttpFetcherImplContentTypeProbeTest.java

@@ -91,14 +91,14 @@ class HttpFetcherImplContentTypeProbeTest {
     @Test
     public void testProbeContentTypeHtmlShortcircuitPath() throws URISyntaxException {
         var result = fetcher.probeContentType(new EdgeUrl("https://localhost/test.html"), new CrawlDelayTimer(50), ContentTags.empty());
-        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Ok.class, result);
+        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
     }

     @Test
     public void testProbeContentTypeHtmlShortcircuitTags() {
         var result = fetcher.probeContentType(contentTypeBinaryUrl, new CrawlDelayTimer(50), new ContentTags("a", "b"));
-        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Ok.class, result);
+        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
     }

     @Test
@Test @Test

View File: CrawlerWarcResynchronizerTest.java

@@ -4,9 +4,9 @@ import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import org.apache.hc.client5.http.classic.HttpClient;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.cookie.BasicCookieStore;
 import org.apache.hc.client5.http.impl.classic.HttpClients;
-import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -78,10 +78,10 @@ class CrawlerWarcResynchronizerTest {
     }

     void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        var req = ClassicRequestBuilder.get(new java.net.URI(url))
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .build();
-        recorder.fetch(httpClient, req);
+        HttpGet request = new HttpGet(url);
+        request.addHeader("User-agent", "test.marginalia.nu");
+        request.addHeader("Accept-Encoding", "gzip");
+        recorder.fetch(httpClient, request);
     }
 }

View File: WarcRecorderFakeServerTest.java

@@ -3,9 +3,9 @@ package nu.marginalia.crawl.retreival.fetcher;
 import com.sun.net.httpserver.HttpServer;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import org.apache.hc.client5.http.classic.HttpClient;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.cookie.BasicCookieStore;
 import org.apache.hc.client5.http.impl.classic.HttpClients;
-import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.junit.jupiter.api.*;
 import org.netpreserve.jwarc.WarcReader;
 import org.netpreserve.jwarc.WarcRequest;
@@ -51,14 +51,14 @@ class WarcRecorderFakeServerTest {
         os.write("<html><body>hello</body></html>".getBytes());
         os.flush();
         try {
-            TimeUnit.SECONDS.sleep(1);
+            TimeUnit.SECONDS.sleep(2);
         } catch (InterruptedException e) {
             throw new RuntimeException(e);
         }
         os.write(":".getBytes());
         os.flush();
         try {
-            TimeUnit.SECONDS.sleep(1);
+            TimeUnit.SECONDS.sleep(2);
         } catch (InterruptedException e) {
             throw new RuntimeException(e);
         }
@@ -100,13 +100,10 @@ class WarcRecorderFakeServerTest {
     @Test
     public void fetchFast() throws Exception {
-        client.fetch(httpClient,
-                ClassicRequestBuilder
-                        .get(new java.net.URI("http://localhost:14510/fast"))
-                        .addHeader("User-agent", "test.marginalia.nu")
-                        .addHeader("Accept-Encoding", "gzip")
-                        .build()
-        );
+        HttpGet request = new HttpGet("http://localhost:14510/fast");
+        request.addHeader("User-agent", "test.marginalia.nu");
+        request.addHeader("Accept-Encoding", "gzip");
+        client.fetch(httpClient, request);

         Map<String, String> sampleData = new HashMap<>();
         try (var warcReader = new WarcReader(fileNameWarc)) {
@@ -127,11 +124,12 @@ class WarcRecorderFakeServerTest {
     public void fetchSlow() throws Exception {
         Instant start = Instant.now();

+        HttpGet request = new HttpGet("http://localhost:14510/slow");
+        request.addHeader("User-agent", "test.marginalia.nu");
+        request.addHeader("Accept-Encoding", "gzip");
+
         client.fetch(httpClient,
-                ClassicRequestBuilder.get(new java.net.URI("http://localhost:14510/slow"))
-                        .addHeader("User-agent", "test.marginalia.nu")
-                        .addHeader("Accept-Encoding", "gzip")
-                        .build(),
+                request,
                 Duration.ofSeconds(1)
         );
         Instant end = Instant.now();
@@ -149,6 +147,8 @@ class WarcRecorderFakeServerTest {
             });
         }

+        System.out.println(
+                Files.readString(fileNameWarc));
         System.out.println(sampleData);

         // Timeout is set to 1 second, but the server will take 5 seconds to respond,

View File: WarcRecorderTest.java

@@ -7,9 +7,9 @@ import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
 import org.apache.hc.client5.http.classic.HttpClient;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.cookie.BasicCookieStore;
 import org.apache.hc.client5.http.impl.classic.HttpClients;
-import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -52,11 +52,14 @@ class WarcRecorderTest {
     @Test
     void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
+        HttpGet request = new HttpGet("https://www.marginalia.nu/");
+        request.addHeader("User-agent", "test.marginalia.nu");
+        request.addHeader("Accept-Encoding", "gzip");
+        client.fetch(httpClient, request);
+
         client.fetch(httpClient,
-                ClassicRequestBuilder.get(new java.net.URI("https://www.marginalia.nu/"))
-                        .addHeader("User-agent", "test.marginalia.nu")
-                        .addHeader("Accept-Encoding", "gzip")
-                        .build()
+                request
         );

         Map<String, String> sampleData = new HashMap<>();
@@ -138,23 +141,23 @@ class WarcRecorderTest {
     @Test
     public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        client.fetch(httpClient, ClassicRequestBuilder
-                .get(new java.net.URI("https://www.marginalia.nu/"))
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .build());
+        HttpGet request1 = new HttpGet("https://www.marginalia.nu/");
+        request1.addHeader("User-agent", "test.marginalia.nu");
+        request1.addHeader("Accept-Encoding", "gzip");
+
+        client.fetch(httpClient, request1);

-        client.fetch(httpClient, ClassicRequestBuilder
-                .get(new java.net.URI("https://www.marginalia.nu/log/"))
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .build());
+        HttpGet request2 = new HttpGet("https://www.marginalia.nu/log/");
+        request2.addHeader("User-agent", "test.marginalia.nu");
+        request2.addHeader("Accept-Encoding", "gzip");
+
+        client.fetch(httpClient, request2);

-        client.fetch(httpClient, ClassicRequestBuilder
-                .get(new java.net.URI("https://www.marginalia.nu/sanic.png"))
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .build());
+        HttpGet request3 = new HttpGet("https://www.marginalia.nu/sanic.png");
+        request3.addHeader("User-agent", "test.marginalia.nu");
+        request3.addHeader("Accept-Encoding", "gzip");
+
+        client.fetch(httpClient, request3);

         CrawledDocumentParquetRecordFileWriter.convertWarc(
                 "www.marginalia.nu",