Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-10-06 07:32:38 +02:00)

Compare commits


10 Commits

Author SHA1 Message Date
Viktor Lofgren
f4ad7145db (crawler) Disable SO_LINGER 2025-04-18 01:42:02 +02:00
Viktor Lofgren
068b450180 (crawler) Temporarily disable request.abort() 2025-04-18 01:25:56 +02:00
Viktor Lofgren
05b909a21f (crawler) Add logging to get more info about connection leak 2025-04-18 01:06:52 +02:00
Viktor Lofgren
3d179cddce (crawler) Correctly consume entity in sitemap retrieval 2025-04-18 00:32:21 +02:00
Viktor Lofgren
1a2aae496a (crawler) Correct handling and aborting of HttpClient requests

There was a resource leak in the initial Apache HttpClient implementation of WarcInputBuffer, which failed to release the response's underlying connection.

Switched to HttpGet objects instead of the Classic...Request objects, as the latter do not expose an abort() method (see the sketch after the changed-files summary below).
2025-04-18 00:16:26 +02:00
Viktor Lofgren
353cdffb3f (crawler) Increase connection request timeout, restore congestion timeout 2025-04-17 21:32:06 +02:00
Viktor Lofgren
2e3f1313c7 (crawler) Log exceptions while crawling in crawler audit log 2025-04-17 21:18:09 +02:00
Viktor Lofgren
58e6f141ce (crawler) Reduce congestion throttle go-rate 2025-04-17 20:36:58 +02:00
Viktor Lofgren
500f63e921 (crawler) Lower max conn per route 2025-04-17 18:36:16 +02:00
Viktor Lofgren
6dfbedda1e (crawler) Increase max conn per route and connection timeout 2025-04-17 18:31:46 +02:00
11 changed files with 199 additions and 132 deletions
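The abort() distinction behind commit 1a2aae496a, as a minimal sketch (assuming Apache HttpClient 5.x semantics; the URL is illustrative):

    HttpGet request = new HttpGet("https://example.com/");
    // HttpGet extends HttpUriRequestBase, which implements CancellableDependency,
    // so an in-flight exchange can be aborted mid-transfer:
    request.abort();
    // A ClassicHttpRequest produced by ClassicRequestBuilder.get(...).build()
    // exposes no such method, which is why the crawler switched request types.

Note that the follow-up commit 068b450180 temporarily disables the abort() call again while its interaction with the connection pool is investigated.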

View File: ContentTags.java

@@ -1,6 +1,6 @@
 package nu.marginalia.crawl.fetcher;
 
-import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 
 /** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
 public record ContentTags(String etag, String lastMod) {
@@ -17,14 +17,14 @@ public record ContentTags(String etag, String lastMod) {
     }
 
     /** Paints the tags onto the request builder. */
-    public void paint(ClassicRequestBuilder getBuilder) {
+    public void paint(HttpGet request) {
 
         if (etag != null) {
-            getBuilder.addHeader("If-None-Match", etag);
+            request.addHeader("If-None-Match", etag);
         }
 
         if (lastMod != null) {
-            getBuilder.addHeader("If-Modified-Since", lastMod);
+            request.addHeader("If-Modified-Since", lastMod);
         }
     }
 }
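A usage sketch for the new paint() signature (the URL and header values are illustrative, not taken from the diff):

    HttpGet request = new HttpGet("https://www.example.com/page.html");
    ContentTags tags = new ContentTags("\"33a64df5\"", "Wed, 21 Oct 2015 07:28:00 GMT");
    tags.paint(request); // adds If-None-Match and If-Modified-Since when non-null
    // A 304 Not Modified response then indicates the cached copy is still current.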

View File: HttpFetcherImpl.java

@@ -17,6 +17,7 @@ import nu.marginalia.model.crawldata.CrawlerDomainStatus;
 import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
 import org.apache.hc.client5.http.HttpRequestRetryStrategy;
 import org.apache.hc.client5.http.classic.HttpClient;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.config.ConnectionConfig;
 import org.apache.hc.client5.http.config.RequestConfig;
 import org.apache.hc.client5.http.cookie.BasicCookieStore;
@@ -34,6 +35,7 @@ import org.apache.hc.core5.http.io.entity.EntityUtils;
 import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.apache.hc.core5.http.message.MessageSupport;
 import org.apache.hc.core5.http.protocol.HttpContext;
+import org.apache.hc.core5.pool.PoolStats;
 import org.apache.hc.core5.util.TimeValue;
 import org.apache.hc.core5.util.Timeout;
 import org.jsoup.Jsoup;
@@ -76,30 +78,48 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
     }
 
     private final CloseableHttpClient client;
+    private PoolingHttpClientConnectionManager connectionManager;
+
+    public PoolStats getPoolStats() {
+        return connectionManager.getTotalStats();
+    }
 
     private CloseableHttpClient createClient() throws NoSuchAlgorithmException {
         final ConnectionConfig connectionConfig = ConnectionConfig.custom()
                 .setSocketTimeout(10, TimeUnit.SECONDS)
-                .setConnectTimeout(10, TimeUnit.SECONDS)
+                .setConnectTimeout(30, TimeUnit.SECONDS)
                 .setValidateAfterInactivity(TimeValue.ofSeconds(5))
                 .build();
 
-        final PoolingHttpClientConnectionManager connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
-                .setMaxConnPerRoute(4)
+        connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
+                .setMaxConnPerRoute(2)
                 .setMaxConnTotal(5000)
                 .setDefaultConnectionConfig(connectionConfig)
                 .setTlsSocketStrategy(new DefaultClientTlsStrategy(SSLContext.getDefault()))
                 .build();
 
         connectionManager.setDefaultSocketConfig(SocketConfig.custom()
-                .setSoLinger(TimeValue.ofSeconds(15))
+                .setSoLinger(TimeValue.ofSeconds(-1))
                 .setSoTimeout(Timeout.ofSeconds(10))
                 .build()
         );
 
+        Thread.ofPlatform().daemon(true).start(() -> {
+            try {
+                for (;;) {
+                    TimeUnit.SECONDS.sleep(15);
+                    logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
+                }
+            }
+            catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+            }
+        });
+
         final RequestConfig defaultRequestConfig = RequestConfig.custom()
                 .setCookieSpec(StandardCookieSpec.RELAXED)
                 .setResponseTimeout(10, TimeUnit.SECONDS)
-                .setConnectionRequestTimeout(8, TimeUnit.SECONDS)
+                .setConnectionRequestTimeout(5, TimeUnit.MINUTES)
                 .build();
 
         return HttpClients.custom()
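The new getPoolStats() accessor and the stats-logging daemon thread exist to make the suspected connection leak observable (cf. commit 05b909a21f). A sketch of how the numbers read, assuming org.apache.hc.core5.pool.PoolStats:

    PoolStats stats = fetcher.getPoolStats();
    // leased    = connections currently checked out of the pool
    // pending   = threads blocked waiting for a free connection
    // available = idle connections kept alive in the pool
    // A leased count that climbs and never falls back is the signature of a leak.
    logger.info("leased={} pending={} available={} max={}",
            stats.getLeased(), stats.getPending(), stats.getAvailable(), stats.getMax());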
@@ -398,16 +418,16 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
         }
 
-        ClassicRequestBuilder getBuilder = ClassicRequestBuilder.get(url.asURI())
-                .addHeader("User-Agent", userAgentString)
-                .addHeader("Accept-Encoding", "gzip")
-                .addHeader("Accept-Language", "en,*;q=0.5")
-                .addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8");
+        HttpGet request = new HttpGet(url.asURI());
+        request.addHeader("User-Agent", userAgentString);
+        request.addHeader("Accept-Encoding", "gzip");
+        request.addHeader("Accept-Language", "en,*;q=0.5");
+        request.addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8");
 
-        contentTags.paint(getBuilder);
+        contentTags.paint(request);
 
         try (var sl = new SendLock()) {
-            HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());
+            HttpFetchResult result = warcRecorder.fetch(client, request);
 
             if (result instanceof HttpFetchResult.ResultOk ok) {
                 if (ok.statusCode() == 304) {
@@ -419,7 +439,7 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
                 case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {}", ok.statusCode(), url);
                 case HttpFetchResult.ResultRedirect redirect -> logger.info(crawlerAuditMarker, "Fetch result redirect: {} for {}", redirect.url(), url);
                 case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none for {}", url);
-                case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception: {} for {}", ex.getClass().getSimpleName(), url);
+                case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception for " + url + ": {}", ex.ex());
                 case HttpFetchResult.Result304Raw raw -> logger.info(crawlerAuditMarker, "Fetch result: 304 Raw for {}", url);
                 case HttpFetchResult.Result304ReplacedWithReference ref -> logger.info(crawlerAuditMarker, "Fetch result: 304 With reference for {}", url);
             }
@@ -494,16 +514,17 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
     }
 
-    private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
-        ClassicHttpRequest getRequest = ClassicRequestBuilder.get(sitemapUrl.asURI())
-                .addHeader("User-Agent", userAgentString)
-                .addHeader("Accept-Encoding", "gzip")
-                .addHeader("Accept", "text/*, */*;q=0.9")
-                .addHeader("User-Agent", userAgentString)
-                .build();
+    private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException {
+        HttpGet getRequest = new HttpGet(sitemapUrl.asURI());
+        getRequest.addHeader("User-Agent", userAgentString);
+        getRequest.addHeader("Accept-Encoding", "gzip");
+        getRequest.addHeader("Accept", "text/*, */*;q=0.9");
+        getRequest.addHeader("User-Agent", userAgentString);
+
         try (var sl = new SendLock()) {
             return client.execute(getRequest, response -> {
+                try {
                     if (response.getCode() != 200) {
                         return new SitemapResult.SitemapError();
                     }
@@ -544,6 +565,10 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
                         }
                         default -> new SitemapResult.SitemapError();
                     };
+                }
+                finally {
+                    EntityUtils.consume(response.getEntity());
+                }
             });
         }
         catch (Exception ex) {
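The added try/finally is what "Correctly consume entity in sitemap retrieval" (3d179cddce) fixes: a pooled connection only becomes reusable once the response entity has been fully consumed. The shape of the pattern, sketched with a hypothetical parse() helper standing in for the sitemap handling above:

    return client.execute(getRequest, response -> {
        try {
            return parse(response); // hypothetical helper, not in the diff
        }
        finally {
            // Drain any unread bytes so the connection returns to the pool
            // instead of staying leased indefinitely.
            EntityUtils.consume(response.getEntity());
        }
    });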
@@ -574,11 +599,10 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
     private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
         try (var sl = new SendLock()) {
-            ClassicHttpRequest request = ClassicRequestBuilder.get(url.asURI())
-                    .addHeader("User-Agent", userAgentString)
-                    .addHeader("Accept-Encoding", "gzip")
-                    .addHeader("Accept", "text/*, */*;q=0.9")
-                    .build();
+            HttpGet request = new HttpGet(url.asURI());
+            request.addHeader("User-Agent", userAgentString);
+            request.addHeader("Accept-Encoding", "gzip");
+            request.addHeader("Accept", "text/*, */*;q=0.9");
 
             HttpFetchResult result = recorder.fetch(client, request);

View File: WarcInputBuffer.java

@@ -2,6 +2,7 @@ package nu.marginalia.crawl.fetcher.warc;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.BOMInputStream;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.core5.http.ClassicHttpResponse;
 import org.apache.hc.core5.http.Header;
 import org.netpreserve.jwarc.WarcTruncationReason;
@@ -43,7 +44,9 @@ public abstract class WarcInputBuffer implements AutoCloseable {
      * and suppressed from the headers.
      * If an error occurs, a buffer will be created with no content and an error status.
      */
-    static WarcInputBuffer forResponse(ClassicHttpResponse response, Duration timeLimit) throws IOException {
+    static WarcInputBuffer forResponse(ClassicHttpResponse response,
+                                       HttpGet request,
+                                       Duration timeLimit) throws IOException {
         if (response == null)
             return new ErrorBuffer();
@@ -54,16 +57,29 @@
             return new ErrorBuffer();
         }
 
-        InputStream is = entity.getContent();
+        InputStream is = null;
+        try {
+            is = entity.getContent();
             long length = entity.getContentLength();
 
             try (response) {
                 if (length > 0 && length < 8192) {
                     // If the content is small and not compressed, we can just read it into memory
-                    return new MemoryBuffer(response.getHeaders(), timeLimit, is, (int) length);
+                    return new MemoryBuffer(response.getHeaders(), request, timeLimit, is, (int) length);
                 } else {
                     // Otherwise, we unpack it into a file and read it from there
-                    return new FileBuffer(response.getHeaders(), timeLimit, is);
+                    return new FileBuffer(response.getHeaders(), request, timeLimit, is);
                 }
             }
         }
+        finally {
+            try {
+                is.skip(Long.MAX_VALUE);
+            }
+            catch (IOException e) {
+                // Ignore the exception
+            }
+            finally {
+                // Close the input stream
+                IOUtils.closeQuietly(is);
+            }
+        }
@@ -71,7 +87,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
     }
 
     /** Copy an input stream to an output stream, with a maximum size and time limit */
-    protected void copy(InputStream is, OutputStream os, Duration timeLimit) {
+    protected void copy(InputStream is, HttpGet request, OutputStream os, Duration timeLimit) {
         Instant start = Instant.now();
         Instant timeout = start.plus(timeLimit);
         long size = 0;
@@ -86,6 +102,12 @@
             Duration remaining = Duration.between(Instant.now(), timeout);
             if (remaining.isNegative()) {
                 truncationReason = WarcTruncationReason.TIME;
+                // Abort the request if the time limit is exceeded
+                // so we don't keep the connection open forever or are forced to consume
+                // the stream to the end
+
+                // FIXME: Disable this for now, as it may cause issues with the connection pool
+                // request.abort();
                 break;
             }
@@ -104,6 +126,7 @@
             }
             else if (truncationReason != WarcTruncationReason.LENGTH) {
                 truncationReason = WarcTruncationReason.LENGTH;
+                break;
             }
         } catch (IOException e) {
@@ -111,13 +134,6 @@
             }
         }
 
-        // Try to close the connection as long as we haven't timed out.
-        // As per Apache HttpClient's semantics, this will reset the connection
-        // and close the stream if we have timed out.
-        if (truncationReason != WarcTruncationReason.TIME) {
-            IOUtils.closeQuietly(is);
-        }
     }
 
     /** Takes a Content-Range header and checks if it is complete.
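For orientation, the copy loop these hunks modify looks roughly like this, assembled from the fragments above (the buffer size and size cap are assumptions, not shown in the diff):

    byte[] buffer = new byte[8192]; // size assumed
    Instant deadline = Instant.now().plus(timeLimit);
    int n;
    while ((n = is.read(buffer)) >= 0) {
        if (Instant.now().isAfter(deadline)) {
            truncationReason = WarcTruncationReason.TIME;
            break; // request.abort() would go here, but is disabled for now
        }
        os.write(buffer, 0, n);
        size += n;
        if (size > MAX_SIZE) { // cap assumed
            truncationReason = WarcTruncationReason.LENGTH;
            break;
        }
    }

With the trailing conditional close removed here, stream cleanup now happens solely in forResponse()'s finally block, which drains to EOF before closing so the connection can be reused.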
@@ -218,7 +234,7 @@ class ErrorBuffer extends WarcInputBuffer {
 
 /** Buffer for when we have the response in memory */
 class MemoryBuffer extends WarcInputBuffer {
     byte[] data;
-    public MemoryBuffer(Header[] headers, Duration timeLimit, InputStream responseStream, int size) {
+    public MemoryBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream, int size) {
         super(suppressContentEncoding(headers));
 
         if (!isRangeComplete(headers)) {
@@ -229,7 +245,7 @@ class MemoryBuffer extends WarcInputBuffer {
         var outputStream = new ByteArrayOutputStream(size);
 
-        copy(responseStream, outputStream, timeLimit);
+        copy(responseStream, request, outputStream, timeLimit);
 
         data = outputStream.toByteArray();
     }
@@ -253,7 +269,7 @@ class MemoryBuffer extends WarcInputBuffer {
 class FileBuffer extends WarcInputBuffer {
     private final Path tempFile;
 
-    public FileBuffer(Header[] headers, Duration timeLimit, InputStream responseStream) throws IOException {
+    public FileBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream) throws IOException {
         super(suppressContentEncoding(headers));
 
         if (!isRangeComplete(headers)) {
@@ -265,7 +281,7 @@ class FileBuffer extends WarcInputBuffer {
         this.tempFile = Files.createTempFile("rsp", ".html");
 
         try (var out = Files.newOutputStream(tempFile)) {
-            copy(responseStream, out, timeLimit);
+            copy(responseStream, request, out, timeLimit);
         }
         catch (Exception ex) {
             truncationReason = WarcTruncationReason.UNSPECIFIED;

View File: WarcRecorder.java

@@ -8,9 +8,9 @@ import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
 import org.apache.hc.client5.http.classic.HttpClient;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.cookie.BasicCookieStore;
 import org.apache.hc.client5.http.cookie.CookieStore;
-import org.apache.hc.core5.http.ClassicHttpRequest;
 import org.apache.hc.core5.http.NameValuePair;
 import org.jetbrains.annotations.Nullable;
 import org.netpreserve.jwarc.*;
@@ -89,14 +89,14 @@ public class WarcRecorder implements AutoCloseable {
     }
 
     public HttpFetchResult fetch(HttpClient client,
-                                 ClassicHttpRequest request)
+                                 HttpGet request)
             throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
     {
         return fetch(client, request, Duration.ofMillis(MAX_TIME));
     }
 
     public HttpFetchResult fetch(HttpClient client,
-                                 ClassicHttpRequest request,
+                                 HttpGet request,
                                  Duration timeout)
             throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
     {
@@ -117,7 +117,7 @@ public class WarcRecorder implements AutoCloseable {
         try {
             return client.execute(request, response -> {
-                try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, timeout);
+                try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request, timeout);
                      InputStream inputStream = inputBuffer.read()) {
 
                     // Build and write the request
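With the signature change, callers now hand WarcRecorder a concrete, abortable HttpGet. A usage sketch (URL and timeout illustrative):

    HttpGet request = new HttpGet("https://www.example.com/");
    request.addHeader("User-Agent", userAgentString);

    HttpFetchResult result = warcRecorder.fetch(client, request, Duration.ofSeconds(10));
    // Passing the request itself through to WarcInputBuffer.forResponse() is what
    // lets the buffer reach request.abort() on timeout (currently commented out).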

View File: CrawlerRetreiver.java

@@ -53,7 +53,7 @@ public class CrawlerRetreiver implements AutoCloseable {
     private final CrawlerRevisitor crawlerRevisitor;
 
     private static final CrawlerConnectionThrottle connectionThrottle = new CrawlerConnectionThrottle(
-            Duration.ofSeconds(1) // pace the connections to avoid network congestion by waiting 1 second between establishing them
+            Duration.ofSeconds(1) // pace the connections to avoid network congestion at startup
     );
 
     int errorCount = 0;

View File: HttpFetcherImplContentTypeProbeTest.java

@@ -11,6 +11,8 @@ import org.junit.jupiter.api.*;
 
 import java.io.IOException;
 import java.net.URISyntaxException;
 
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
 @Tag("slow")
 class HttpFetcherImplContentTypeProbeTest {
@@ -85,20 +87,24 @@ class HttpFetcherImplContentTypeProbeTest {
 
     @AfterEach
     public void tearDown() throws IOException {
+        var stats = fetcher.getPoolStats();
+        assertEquals(0, stats.getLeased());
+        assertEquals(0, stats.getPending());
+
         fetcher.close();
     }
 
     @Test
     public void testProbeContentTypeHtmlShortcircuitPath() throws URISyntaxException {
         var result = fetcher.probeContentType(new EdgeUrl("https://localhost/test.html"), new CrawlDelayTimer(50), ContentTags.empty());
-        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Ok.class, result);
+        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
     }
 
     @Test
     public void testProbeContentTypeHtmlShortcircuitTags() {
         var result = fetcher.probeContentType(contentTypeBinaryUrl, new CrawlDelayTimer(50), new ContentTags("a", "b"));
-        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Ok.class, result);
+        Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
     }
@Test

View File: HttpFetcherImplDomainProbeTest.java

@@ -12,6 +12,8 @@ import org.junit.jupiter.api.*;
 
 import java.io.IOException;
 import java.net.URISyntaxException;
 
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
 @Tag("slow")
 class HttpFetcherImplDomainProbeTest {
@@ -47,6 +49,10 @@ class HttpFetcherImplDomainProbeTest {
 
     @AfterEach
     public void tearDown() throws IOException {
+        var stats = fetcher.getPoolStats();
+        assertEquals(0, stats.getLeased());
+        assertEquals(0, stats.getPending());
+
         fetcher.close();
     }

View File: HttpFetcherImplFetchTest.java

@@ -139,12 +139,23 @@ class HttpFetcherImplFetchTest {
 
     @AfterEach
     public void tearDown() throws IOException {
+        var stats = fetcher.getPoolStats();
+        assertEquals(0, stats.getLeased());
+        assertEquals(0, stats.getPending());
+
+        System.out.println(stats);
+
         fetcher.close();
         warcRecorder.close();
         Files.deleteIfExists(warcFile);
     }
 
+    @Test
+    public void testFoo() {
+        fetcher.fetchSitemapUrls("https://www.marginalia.nu/sitemap.xml", new CrawlDelayTimer(100));
+    }
+
     @Test
     public void testOk_NoProbe() throws IOException {
         var result = fetcher.fetchContent(okUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);

View File: CrawlerWarcResynchronizerTest.java

@@ -4,9 +4,9 @@ import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import org.apache.hc.client5.http.classic.HttpClient;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.cookie.BasicCookieStore;
 import org.apache.hc.client5.http.impl.classic.HttpClients;
-import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -78,10 +78,10 @@ class CrawlerWarcResynchronizerTest {
     }
 
     void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        var req = ClassicRequestBuilder.get(new java.net.URI(url))
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .build();
-        recorder.fetch(httpClient, req);
+        HttpGet request = new HttpGet(url);
+        request.addHeader("User-agent", "test.marginalia.nu");
+        request.addHeader("Accept-Encoding", "gzip");
+        recorder.fetch(httpClient, request);
     }
 }

View File: WarcRecorderFakeServerTest.java

@@ -3,9 +3,9 @@ package nu.marginalia.crawl.retreival.fetcher;
 
 import com.sun.net.httpserver.HttpServer;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import org.apache.hc.client5.http.classic.HttpClient;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.cookie.BasicCookieStore;
 import org.apache.hc.client5.http.impl.classic.HttpClients;
-import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.junit.jupiter.api.*;
 import org.netpreserve.jwarc.WarcReader;
 import org.netpreserve.jwarc.WarcRequest;
@@ -51,14 +51,14 @@ class WarcRecorderFakeServerTest {
             os.write("<html><body>hello</body></html>".getBytes());
             os.flush();
             try {
-                TimeUnit.SECONDS.sleep(1);
+                TimeUnit.SECONDS.sleep(2);
             } catch (InterruptedException e) {
                 throw new RuntimeException(e);
             }
             os.write(":".getBytes());
             os.flush();
             try {
-                TimeUnit.SECONDS.sleep(1);
+                TimeUnit.SECONDS.sleep(2);
             } catch (InterruptedException e) {
                 throw new RuntimeException(e);
             }
@@ -94,19 +94,17 @@ class WarcRecorderFakeServerTest {
     @AfterEach
     public void tearDown() throws Exception {
         client.close();
+
         Files.delete(fileNameWarc);
     }
 
     @Test
     public void fetchFast() throws Exception {
-        client.fetch(httpClient,
-                ClassicRequestBuilder
-                        .get(new java.net.URI("http://localhost:14510/fast"))
-                        .addHeader("User-agent", "test.marginalia.nu")
-                        .addHeader("Accept-Encoding", "gzip")
-                        .build()
-        );
+        HttpGet request = new HttpGet("http://localhost:14510/fast");
+        request.addHeader("User-agent", "test.marginalia.nu");
+        request.addHeader("Accept-Encoding", "gzip");
+        client.fetch(httpClient, request);
 
         Map<String, String> sampleData = new HashMap<>();
         try (var warcReader = new WarcReader(fileNameWarc)) {
@@ -127,11 +125,12 @@ class WarcRecorderFakeServerTest {
     public void fetchSlow() throws Exception {
         Instant start = Instant.now();
 
+        HttpGet request = new HttpGet("http://localhost:14510/slow");
+        request.addHeader("User-agent", "test.marginalia.nu");
+        request.addHeader("Accept-Encoding", "gzip");
+
         client.fetch(httpClient,
-                ClassicRequestBuilder.get(new java.net.URI("http://localhost:14510/slow"))
-                        .addHeader("User-agent", "test.marginalia.nu")
-                        .addHeader("Accept-Encoding", "gzip")
-                        .build(),
+                request,
                 Duration.ofSeconds(1)
         );
         Instant end = Instant.now();
@@ -149,6 +148,8 @@ class WarcRecorderFakeServerTest {
             });
         }
 
+        System.out.println(
+                Files.readString(fileNameWarc));
         System.out.println(sampleData);
 
         // Timeout is set to 1 second, but the server will take 5 seconds to respond,

View File: WarcRecorderTest.java

@@ -7,9 +7,9 @@ import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
 import org.apache.hc.client5.http.classic.HttpClient;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.cookie.BasicCookieStore;
 import org.apache.hc.client5.http.impl.classic.HttpClients;
-import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -52,11 +52,14 @@ class WarcRecorderTest {
     @Test
     void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
+        HttpGet request = new HttpGet("https://www.marginalia.nu/");
+        request.addHeader("User-agent", "test.marginalia.nu");
+        request.addHeader("Accept-Encoding", "gzip");
+
         client.fetch(httpClient,
-                ClassicRequestBuilder.get(new java.net.URI("https://www.marginalia.nu/"))
-                        .addHeader("User-agent", "test.marginalia.nu")
-                        .addHeader("Accept-Encoding", "gzip")
-                        .build()
+                request
         );
 
         Map<String, String> sampleData = new HashMap<>();
@@ -138,23 +141,23 @@ class WarcRecorderTest {
     @Test
     public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        client.fetch(httpClient, ClassicRequestBuilder
-                .get(new java.net.URI("https://www.marginalia.nu/"))
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .build());
+        HttpGet request1 = new HttpGet("https://www.marginalia.nu/");
+        request1.addHeader("User-agent", "test.marginalia.nu");
+        request1.addHeader("Accept-Encoding", "gzip");
+        client.fetch(httpClient, request1);
 
-        client.fetch(httpClient, ClassicRequestBuilder
-                .get(new java.net.URI("https://www.marginalia.nu/log/"))
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .build());
+        HttpGet request2 = new HttpGet("https://www.marginalia.nu/log/");
+        request2.addHeader("User-agent", "test.marginalia.nu");
+        request2.addHeader("Accept-Encoding", "gzip");
+        client.fetch(httpClient, request2);
 
-        client.fetch(httpClient, ClassicRequestBuilder
-                .get(new java.net.URI("https://www.marginalia.nu/sanic.png"))
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .build());
+        HttpGet request3 = new HttpGet("https://www.marginalia.nu/sanic.png");
+        request3.addHeader("User-agent", "test.marginalia.nu");
+        request3.addHeader("Accept-Encoding", "gzip");
+        client.fetch(httpClient, request3);
 
         CrawledDocumentParquetRecordFileWriter.convertWarc(
                 "www.marginalia.nu",