1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

6 Commits

Author SHA1 Message Date
Viktor Lofgren
f4ad7145db (crawler) Disable SO_LINGER 2025-04-18 01:42:02 +02:00
Viktor Lofgren
068b450180 (crawler) Temporarily disable request.abort() 2025-04-18 01:25:56 +02:00
Viktor Lofgren
05b909a21f (crawler) Add logging to get more info about connection leak 2025-04-18 01:06:52 +02:00
Viktor Lofgren
3d179cddce (crawler) Correctly consume entity in sitemap retrieval 2025-04-18 00:32:21 +02:00
Viktor Lofgren
1a2aae496a (crawler) Correct handling and abortion of HttpClient's requests
There was a resource leak in the initial implementation of the Apache HttpClient WarcInputBuffer that failed to free up resources.

Using HttpGet objects instead of the Classic...Request objects, as the latter fail to expose an abort()-method.
2025-04-18 00:16:26 +02:00
Viktor Lofgren
353cdffb3f (crawler) Increase connection request timeout, restore congestion timeout 2025-04-17 21:32:06 +02:00
11 changed files with 196 additions and 129 deletions

View File

@@ -1,6 +1,6 @@
package nu.marginalia.crawl.fetcher; package nu.marginalia.crawl.fetcher;
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder; import org.apache.hc.client5.http.classic.methods.HttpGet;
/** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */ /** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
public record ContentTags(String etag, String lastMod) { public record ContentTags(String etag, String lastMod) {
@@ -17,14 +17,14 @@ public record ContentTags(String etag, String lastMod) {
} }
/** Paints the tags onto the request builder. */ /** Paints the tags onto the request builder. */
public void paint(ClassicRequestBuilder getBuilder) { public void paint(HttpGet request) {
if (etag != null) { if (etag != null) {
getBuilder.addHeader("If-None-Match", etag); request.addHeader("If-None-Match", etag);
} }
if (lastMod != null) { if (lastMod != null) {
getBuilder.addHeader("If-Modified-Since", lastMod); request.addHeader("If-Modified-Since", lastMod);
} }
} }
} }

View File

@@ -17,6 +17,7 @@ import nu.marginalia.model.crawldata.CrawlerDomainStatus;
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy; import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
import org.apache.hc.client5.http.HttpRequestRetryStrategy; import org.apache.hc.client5.http.HttpRequestRetryStrategy;
import org.apache.hc.client5.http.classic.HttpClient; import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.client5.http.config.ConnectionConfig; import org.apache.hc.client5.http.config.ConnectionConfig;
import org.apache.hc.client5.http.config.RequestConfig; import org.apache.hc.client5.http.config.RequestConfig;
import org.apache.hc.client5.http.cookie.BasicCookieStore; import org.apache.hc.client5.http.cookie.BasicCookieStore;
@@ -34,6 +35,7 @@ import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder; import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
import org.apache.hc.core5.http.message.MessageSupport; import org.apache.hc.core5.http.message.MessageSupport;
import org.apache.hc.core5.http.protocol.HttpContext; import org.apache.hc.core5.http.protocol.HttpContext;
import org.apache.hc.core5.pool.PoolStats;
import org.apache.hc.core5.util.TimeValue; import org.apache.hc.core5.util.TimeValue;
import org.apache.hc.core5.util.Timeout; import org.apache.hc.core5.util.Timeout;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
@@ -76,14 +78,20 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
} }
private final CloseableHttpClient client; private final CloseableHttpClient client;
private PoolingHttpClientConnectionManager connectionManager;
public PoolStats getPoolStats() {
return connectionManager.getTotalStats();
}
private CloseableHttpClient createClient() throws NoSuchAlgorithmException { private CloseableHttpClient createClient() throws NoSuchAlgorithmException {
final ConnectionConfig connectionConfig = ConnectionConfig.custom() final ConnectionConfig connectionConfig = ConnectionConfig.custom()
.setSocketTimeout(10, TimeUnit.SECONDS) .setSocketTimeout(10, TimeUnit.SECONDS)
.setConnectTimeout(30, TimeUnit.SECONDS) .setConnectTimeout(30, TimeUnit.SECONDS)
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
.build(); .build();
final PoolingHttpClientConnectionManager connectionManager = PoolingHttpClientConnectionManagerBuilder.create() connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
.setMaxConnPerRoute(2) .setMaxConnPerRoute(2)
.setMaxConnTotal(5000) .setMaxConnTotal(5000)
.setDefaultConnectionConfig(connectionConfig) .setDefaultConnectionConfig(connectionConfig)
@@ -91,15 +99,27 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
.build(); .build();
connectionManager.setDefaultSocketConfig(SocketConfig.custom() connectionManager.setDefaultSocketConfig(SocketConfig.custom()
.setSoLinger(TimeValue.ofSeconds(15)) .setSoLinger(TimeValue.ofSeconds(-1))
.setSoTimeout(Timeout.ofSeconds(10)) .setSoTimeout(Timeout.ofSeconds(10))
.build() .build()
); );
Thread.ofPlatform().daemon(true).start(() -> {
try {
for (;;) {
TimeUnit.SECONDS.sleep(15);
logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
}
}
catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
});
final RequestConfig defaultRequestConfig = RequestConfig.custom() final RequestConfig defaultRequestConfig = RequestConfig.custom()
.setCookieSpec(StandardCookieSpec.RELAXED) .setCookieSpec(StandardCookieSpec.RELAXED)
.setResponseTimeout(10, TimeUnit.SECONDS) .setResponseTimeout(10, TimeUnit.SECONDS)
.setConnectionRequestTimeout(8, TimeUnit.SECONDS) .setConnectionRequestTimeout(5, TimeUnit.MINUTES)
.build(); .build();
return HttpClients.custom() return HttpClients.custom()
@@ -398,16 +418,16 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
} }
ClassicRequestBuilder getBuilder = ClassicRequestBuilder.get(url.asURI()) HttpGet request = new HttpGet(url.asURI());
.addHeader("User-Agent", userAgentString) request.addHeader("User-Agent", userAgentString);
.addHeader("Accept-Encoding", "gzip") request.addHeader("Accept-Encoding", "gzip");
.addHeader("Accept-Language", "en,*;q=0.5") request.addHeader("Accept-Language", "en,*;q=0.5");
.addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8"); request.addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8");
contentTags.paint(getBuilder); contentTags.paint(request);
try (var sl = new SendLock()) { try (var sl = new SendLock()) {
HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build()); HttpFetchResult result = warcRecorder.fetch(client, request);
if (result instanceof HttpFetchResult.ResultOk ok) { if (result instanceof HttpFetchResult.ResultOk ok) {
if (ok.statusCode() == 304) { if (ok.statusCode() == 304) {
@@ -494,56 +514,61 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
} }
private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException { private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException {
ClassicHttpRequest getRequest = ClassicRequestBuilder.get(sitemapUrl.asURI()) HttpGet getRequest = new HttpGet(sitemapUrl.asURI());
.addHeader("User-Agent", userAgentString)
.addHeader("Accept-Encoding", "gzip") getRequest.addHeader("User-Agent", userAgentString);
.addHeader("Accept", "text/*, */*;q=0.9") getRequest.addHeader("Accept-Encoding", "gzip");
.addHeader("User-Agent", userAgentString) getRequest.addHeader("Accept", "text/*, */*;q=0.9");
.build(); getRequest.addHeader("User-Agent", userAgentString);
try (var sl = new SendLock()) { try (var sl = new SendLock()) {
return client.execute(getRequest, response -> { return client.execute(getRequest, response -> {
if (response.getCode() != 200) { try {
return new SitemapResult.SitemapError(); if (response.getCode() != 200) {
return new SitemapResult.SitemapError();
}
Document parsedSitemap = Jsoup.parse(
EntityUtils.toString(response.getEntity()),
sitemapUrl.toString(),
Parser.xmlParser()
);
if (parsedSitemap.childrenSize() == 0) {
return new SitemapResult.SitemapError();
}
String rootTagName = parsedSitemap.child(0).tagName();
return switch (rootTagName.toLowerCase()) {
case "sitemapindex" -> {
List<String> references = new ArrayList<>();
for (var locTag : parsedSitemap.getElementsByTag("loc")) {
references.add(locTag.text().trim());
}
yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
}
case "urlset" -> {
List<String> urls = new ArrayList<>();
for (var locTag : parsedSitemap.select("url > loc")) {
urls.add(locTag.text().trim());
}
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
}
case "rss", "atom" -> {
List<String> urls = new ArrayList<>();
for (var locTag : parsedSitemap.select("link, url")) {
urls.add(locTag.text().trim());
}
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
}
default -> new SitemapResult.SitemapError();
};
} }
finally {
Document parsedSitemap = Jsoup.parse( EntityUtils.consume(response.getEntity());
EntityUtils.toString(response.getEntity()),
sitemapUrl.toString(),
Parser.xmlParser()
);
if (parsedSitemap.childrenSize() == 0) {
return new SitemapResult.SitemapError();
} }
String rootTagName = parsedSitemap.child(0).tagName();
return switch (rootTagName.toLowerCase()) {
case "sitemapindex" -> {
List<String> references = new ArrayList<>();
for (var locTag : parsedSitemap.getElementsByTag("loc")) {
references.add(locTag.text().trim());
}
yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
}
case "urlset" -> {
List<String> urls = new ArrayList<>();
for (var locTag : parsedSitemap.select("url > loc")) {
urls.add(locTag.text().trim());
}
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
}
case "rss", "atom" -> {
List<String> urls = new ArrayList<>();
for (var locTag : parsedSitemap.select("link, url")) {
urls.add(locTag.text().trim());
}
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
}
default -> new SitemapResult.SitemapError();
};
}); });
} }
catch (Exception ex) { catch (Exception ex) {
@@ -574,11 +599,10 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) { private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
try (var sl = new SendLock()) { try (var sl = new SendLock()) {
ClassicHttpRequest request = ClassicRequestBuilder.get(url.asURI()) HttpGet request = new HttpGet(url.asURI());
.addHeader("User-Agent", userAgentString) request.addHeader("User-Agent", userAgentString);
.addHeader("Accept-Encoding", "gzip") request.addHeader("Accept-Encoding", "gzip");
.addHeader("Accept", "text/*, */*;q=0.9") request.addHeader("Accept", "text/*, */*;q=0.9");
.build();
HttpFetchResult result = recorder.fetch(client, request); HttpFetchResult result = recorder.fetch(client, request);

View File

@@ -2,6 +2,7 @@ package nu.marginalia.crawl.fetcher.warc;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BOMInputStream; import org.apache.commons.io.input.BOMInputStream;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.core5.http.ClassicHttpResponse; import org.apache.hc.core5.http.ClassicHttpResponse;
import org.apache.hc.core5.http.Header; import org.apache.hc.core5.http.Header;
import org.netpreserve.jwarc.WarcTruncationReason; import org.netpreserve.jwarc.WarcTruncationReason;
@@ -43,7 +44,9 @@ public abstract class WarcInputBuffer implements AutoCloseable {
* and suppressed from the headers. * and suppressed from the headers.
* If an error occurs, a buffer will be created with no content and an error status. * If an error occurs, a buffer will be created with no content and an error status.
*/ */
static WarcInputBuffer forResponse(ClassicHttpResponse response, Duration timeLimit) throws IOException { static WarcInputBuffer forResponse(ClassicHttpResponse response,
HttpGet request,
Duration timeLimit) throws IOException {
if (response == null) if (response == null)
return new ErrorBuffer(); return new ErrorBuffer();
@@ -54,16 +57,29 @@ public abstract class WarcInputBuffer implements AutoCloseable {
return new ErrorBuffer(); return new ErrorBuffer();
} }
InputStream is = entity.getContent(); InputStream is = null;
long length = entity.getContentLength(); try {
is = entity.getContent();
long length = entity.getContentLength();
try (response) {
if (length > 0 && length < 8192) { if (length > 0 && length < 8192) {
// If the content is small and not compressed, we can just read it into memory // If the content is small and not compressed, we can just read it into memory
return new MemoryBuffer(response.getHeaders(), timeLimit, is, (int) length); return new MemoryBuffer(response.getHeaders(), request, timeLimit, is, (int) length);
} else { } else {
// Otherwise, we unpack it into a file and read it from there // Otherwise, we unpack it into a file and read it from there
return new FileBuffer(response.getHeaders(), timeLimit, is); return new FileBuffer(response.getHeaders(), request, timeLimit, is);
}
}
finally {
try {
is.skip(Long.MAX_VALUE);
}
catch (IOException e) {
// Ignore the exception
}
finally {
// Close the input stream
IOUtils.closeQuietly(is);
} }
} }
@@ -71,7 +87,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
} }
/** Copy an input stream to an output stream, with a maximum size and time limit */ /** Copy an input stream to an output stream, with a maximum size and time limit */
protected void copy(InputStream is, OutputStream os, Duration timeLimit) { protected void copy(InputStream is, HttpGet request, OutputStream os, Duration timeLimit) {
Instant start = Instant.now(); Instant start = Instant.now();
Instant timeout = start.plus(timeLimit); Instant timeout = start.plus(timeLimit);
long size = 0; long size = 0;
@@ -86,6 +102,12 @@ public abstract class WarcInputBuffer implements AutoCloseable {
Duration remaining = Duration.between(Instant.now(), timeout); Duration remaining = Duration.between(Instant.now(), timeout);
if (remaining.isNegative()) { if (remaining.isNegative()) {
truncationReason = WarcTruncationReason.TIME; truncationReason = WarcTruncationReason.TIME;
// Abort the request if the time limit is exceeded
// so we don't keep the connection open forever or are forced to consume
// the stream to the end
// FIXME: Disable this for now, as it may cause issues with the connection pool
// request.abort();
break; break;
} }
@@ -104,6 +126,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
} }
else if (truncationReason != WarcTruncationReason.LENGTH) { else if (truncationReason != WarcTruncationReason.LENGTH) {
truncationReason = WarcTruncationReason.LENGTH; truncationReason = WarcTruncationReason.LENGTH;
break;
} }
} catch (IOException e) { } catch (IOException e) {
@@ -111,13 +134,6 @@ public abstract class WarcInputBuffer implements AutoCloseable {
} }
} }
// Try to close the connection as long as we haven't timed out.
// As per Apache HttpClient's semantics, this will reset the connection
// and close the stream if we have timed out.
if (truncationReason != WarcTruncationReason.TIME) {
IOUtils.closeQuietly(is);
}
} }
/** Takes a Content-Range header and checks if it is complete. /** Takes a Content-Range header and checks if it is complete.
@@ -218,7 +234,7 @@ class ErrorBuffer extends WarcInputBuffer {
/** Buffer for when we have the response in memory */ /** Buffer for when we have the response in memory */
class MemoryBuffer extends WarcInputBuffer { class MemoryBuffer extends WarcInputBuffer {
byte[] data; byte[] data;
public MemoryBuffer(Header[] headers, Duration timeLimit, InputStream responseStream, int size) { public MemoryBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream, int size) {
super(suppressContentEncoding(headers)); super(suppressContentEncoding(headers));
if (!isRangeComplete(headers)) { if (!isRangeComplete(headers)) {
@@ -229,7 +245,7 @@ class MemoryBuffer extends WarcInputBuffer {
var outputStream = new ByteArrayOutputStream(size); var outputStream = new ByteArrayOutputStream(size);
copy(responseStream, outputStream, timeLimit); copy(responseStream, request, outputStream, timeLimit);
data = outputStream.toByteArray(); data = outputStream.toByteArray();
} }
@@ -253,7 +269,7 @@ class MemoryBuffer extends WarcInputBuffer {
class FileBuffer extends WarcInputBuffer { class FileBuffer extends WarcInputBuffer {
private final Path tempFile; private final Path tempFile;
public FileBuffer(Header[] headers, Duration timeLimit, InputStream responseStream) throws IOException { public FileBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream) throws IOException {
super(suppressContentEncoding(headers)); super(suppressContentEncoding(headers));
if (!isRangeComplete(headers)) { if (!isRangeComplete(headers)) {
@@ -265,7 +281,7 @@ class FileBuffer extends WarcInputBuffer {
this.tempFile = Files.createTempFile("rsp", ".html"); this.tempFile = Files.createTempFile("rsp", ".html");
try (var out = Files.newOutputStream(tempFile)) { try (var out = Files.newOutputStream(tempFile)) {
copy(responseStream, out, timeLimit); copy(responseStream, request, out, timeLimit);
} }
catch (Exception ex) { catch (Exception ex) {
truncationReason = WarcTruncationReason.UNSPECIFIED; truncationReason = WarcTruncationReason.UNSPECIFIED;

View File

@@ -8,9 +8,9 @@ import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult; import nu.marginalia.model.body.HttpFetchResult;
import org.apache.hc.client5.http.classic.HttpClient; import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.client5.http.cookie.BasicCookieStore; import org.apache.hc.client5.http.cookie.BasicCookieStore;
import org.apache.hc.client5.http.cookie.CookieStore; import org.apache.hc.client5.http.cookie.CookieStore;
import org.apache.hc.core5.http.ClassicHttpRequest;
import org.apache.hc.core5.http.NameValuePair; import org.apache.hc.core5.http.NameValuePair;
import org.jetbrains.annotations.Nullable; import org.jetbrains.annotations.Nullable;
import org.netpreserve.jwarc.*; import org.netpreserve.jwarc.*;
@@ -89,14 +89,14 @@ public class WarcRecorder implements AutoCloseable {
} }
public HttpFetchResult fetch(HttpClient client, public HttpFetchResult fetch(HttpClient client,
ClassicHttpRequest request) HttpGet request)
throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
{ {
return fetch(client, request, Duration.ofMillis(MAX_TIME)); return fetch(client, request, Duration.ofMillis(MAX_TIME));
} }
public HttpFetchResult fetch(HttpClient client, public HttpFetchResult fetch(HttpClient client,
ClassicHttpRequest request, HttpGet request,
Duration timeout) Duration timeout)
throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
{ {
@@ -117,7 +117,7 @@ public class WarcRecorder implements AutoCloseable {
try { try {
return client.execute(request, response -> { return client.execute(request, response -> {
try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, timeout); try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request, timeout);
InputStream inputStream = inputBuffer.read()) { InputStream inputStream = inputBuffer.read()) {
// Build and write the request // Build and write the request

View File

@@ -53,7 +53,7 @@ public class CrawlerRetreiver implements AutoCloseable {
private final CrawlerRevisitor crawlerRevisitor; private final CrawlerRevisitor crawlerRevisitor;
private static final CrawlerConnectionThrottle connectionThrottle = new CrawlerConnectionThrottle( private static final CrawlerConnectionThrottle connectionThrottle = new CrawlerConnectionThrottle(
Duration.ofMillis(50) // pace the connections to avoid network congestion at startup Duration.ofSeconds(1) // pace the connections to avoid network congestion at startup
); );
int errorCount = 0; int errorCount = 0;

View File

@@ -11,6 +11,8 @@ import org.junit.jupiter.api.*;
import java.io.IOException; import java.io.IOException;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import static org.junit.jupiter.api.Assertions.assertEquals;
@Tag("slow") @Tag("slow")
class HttpFetcherImplContentTypeProbeTest { class HttpFetcherImplContentTypeProbeTest {
@@ -85,20 +87,24 @@ class HttpFetcherImplContentTypeProbeTest {
@AfterEach @AfterEach
public void tearDown() throws IOException { public void tearDown() throws IOException {
var stats = fetcher.getPoolStats();
assertEquals(0, stats.getLeased());
assertEquals(0, stats.getPending());
fetcher.close(); fetcher.close();
} }
@Test @Test
public void testProbeContentTypeHtmlShortcircuitPath() throws URISyntaxException { public void testProbeContentTypeHtmlShortcircuitPath() throws URISyntaxException {
var result = fetcher.probeContentType(new EdgeUrl("https://localhost/test.html"), new CrawlDelayTimer(50), ContentTags.empty()); var result = fetcher.probeContentType(new EdgeUrl("https://localhost/test.html"), new CrawlDelayTimer(50), ContentTags.empty());
Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Ok.class, result); Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
} }
@Test @Test
public void testProbeContentTypeHtmlShortcircuitTags() { public void testProbeContentTypeHtmlShortcircuitTags() {
var result = fetcher.probeContentType(contentTypeBinaryUrl, new CrawlDelayTimer(50), new ContentTags("a", "b")); var result = fetcher.probeContentType(contentTypeBinaryUrl, new CrawlDelayTimer(50), new ContentTags("a", "b"));
Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Ok.class, result); Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
} }
@Test @Test

View File

@@ -12,6 +12,8 @@ import org.junit.jupiter.api.*;
import java.io.IOException; import java.io.IOException;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import static org.junit.jupiter.api.Assertions.assertEquals;
@Tag("slow") @Tag("slow")
class HttpFetcherImplDomainProbeTest { class HttpFetcherImplDomainProbeTest {
@@ -47,6 +49,10 @@ class HttpFetcherImplDomainProbeTest {
@AfterEach @AfterEach
public void tearDown() throws IOException { public void tearDown() throws IOException {
var stats = fetcher.getPoolStats();
assertEquals(0, stats.getLeased());
assertEquals(0, stats.getPending());
fetcher.close(); fetcher.close();
} }

View File

@@ -139,12 +139,23 @@ class HttpFetcherImplFetchTest {
@AfterEach @AfterEach
public void tearDown() throws IOException { public void tearDown() throws IOException {
var stats = fetcher.getPoolStats();
assertEquals(0, stats.getLeased());
assertEquals(0, stats.getPending());
System.out.println(stats);
fetcher.close(); fetcher.close();
warcRecorder.close(); warcRecorder.close();
Files.deleteIfExists(warcFile); Files.deleteIfExists(warcFile);
} }
@Test
public void testFoo() {
fetcher.fetchSitemapUrls("https://www.marginalia.nu/sitemap.xml", new CrawlDelayTimer(100));
}
@Test @Test
public void testOk_NoProbe() throws IOException { public void testOk_NoProbe() throws IOException {
var result = fetcher.fetchContent(okUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED); var result = fetcher.fetchContent(okUrl, warcRecorder, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);

View File

@@ -4,9 +4,9 @@ import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import org.apache.hc.client5.http.classic.HttpClient; import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.client5.http.cookie.BasicCookieStore; import org.apache.hc.client5.http.cookie.BasicCookieStore;
import org.apache.hc.client5.http.impl.classic.HttpClients; import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
@@ -78,10 +78,10 @@ class CrawlerWarcResynchronizerTest {
} }
void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException { void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
var req = ClassicRequestBuilder.get(new java.net.URI(url)) HttpGet request = new HttpGet(url);
.addHeader("User-agent", "test.marginalia.nu") request.addHeader("User-agent", "test.marginalia.nu");
.addHeader("Accept-Encoding", "gzip") request.addHeader("Accept-Encoding", "gzip");
.build();
recorder.fetch(httpClient, req); recorder.fetch(httpClient, request);
} }
} }

View File

@@ -3,9 +3,9 @@ package nu.marginalia.crawl.retreival.fetcher;
import com.sun.net.httpserver.HttpServer; import com.sun.net.httpserver.HttpServer;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder; import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import org.apache.hc.client5.http.classic.HttpClient; import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.client5.http.cookie.BasicCookieStore; import org.apache.hc.client5.http.cookie.BasicCookieStore;
import org.apache.hc.client5.http.impl.classic.HttpClients; import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
import org.junit.jupiter.api.*; import org.junit.jupiter.api.*;
import org.netpreserve.jwarc.WarcReader; import org.netpreserve.jwarc.WarcReader;
import org.netpreserve.jwarc.WarcRequest; import org.netpreserve.jwarc.WarcRequest;
@@ -51,14 +51,14 @@ class WarcRecorderFakeServerTest {
os.write("<html><body>hello</body></html>".getBytes()); os.write("<html><body>hello</body></html>".getBytes());
os.flush(); os.flush();
try { try {
TimeUnit.SECONDS.sleep(1); TimeUnit.SECONDS.sleep(2);
} catch (InterruptedException e) { } catch (InterruptedException e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
os.write(":".getBytes()); os.write(":".getBytes());
os.flush(); os.flush();
try { try {
TimeUnit.SECONDS.sleep(1); TimeUnit.SECONDS.sleep(2);
} catch (InterruptedException e) { } catch (InterruptedException e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
@@ -94,19 +94,17 @@ class WarcRecorderFakeServerTest {
@AfterEach @AfterEach
public void tearDown() throws Exception { public void tearDown() throws Exception {
client.close(); client.close();
Files.delete(fileNameWarc); Files.delete(fileNameWarc);
} }
@Test @Test
public void fetchFast() throws Exception { public void fetchFast() throws Exception {
client.fetch(httpClient, HttpGet request = new HttpGet("http://localhost:14510/fast");
ClassicRequestBuilder request.addHeader("User-agent", "test.marginalia.nu");
.get(new java.net.URI("http://localhost:14510/fast")) request.addHeader("Accept-Encoding", "gzip");
.addHeader("User-agent", "test.marginalia.nu") client.fetch(httpClient, request);
.addHeader("Accept-Encoding", "gzip")
.build()
);
Map<String, String> sampleData = new HashMap<>(); Map<String, String> sampleData = new HashMap<>();
try (var warcReader = new WarcReader(fileNameWarc)) { try (var warcReader = new WarcReader(fileNameWarc)) {
@@ -127,11 +125,12 @@ class WarcRecorderFakeServerTest {
public void fetchSlow() throws Exception { public void fetchSlow() throws Exception {
Instant start = Instant.now(); Instant start = Instant.now();
HttpGet request = new HttpGet("http://localhost:14510/slow");
request.addHeader("User-agent", "test.marginalia.nu");
request.addHeader("Accept-Encoding", "gzip");
client.fetch(httpClient, client.fetch(httpClient,
ClassicRequestBuilder.get(new java.net.URI("http://localhost:14510/slow")) request,
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.build(),
Duration.ofSeconds(1) Duration.ofSeconds(1)
); );
Instant end = Instant.now(); Instant end = Instant.now();
@@ -149,6 +148,8 @@ class WarcRecorderFakeServerTest {
}); });
} }
System.out.println(
Files.readString(fileNameWarc));
System.out.println(sampleData); System.out.println(sampleData);
// Timeout is set to 1 second, but the server will take 5 seconds to respond, // Timeout is set to 1 second, but the server will take 5 seconds to respond,

View File

@@ -7,9 +7,9 @@ import nu.marginalia.model.EdgeUrl;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader; import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter; import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
import org.apache.hc.client5.http.classic.HttpClient; import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.client5.http.cookie.BasicCookieStore; import org.apache.hc.client5.http.cookie.BasicCookieStore;
import org.apache.hc.client5.http.impl.classic.HttpClients; import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
@@ -52,11 +52,14 @@ class WarcRecorderTest {
@Test @Test
void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException { void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
HttpGet request = new HttpGet("https://www.marginalia.nu/");
request.addHeader("User-agent", "test.marginalia.nu");
request.addHeader("Accept-Encoding", "gzip");
client.fetch(httpClient, request);
client.fetch(httpClient, client.fetch(httpClient,
ClassicRequestBuilder.get(new java.net.URI("https://www.marginalia.nu/")) request
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.build()
); );
Map<String, String> sampleData = new HashMap<>(); Map<String, String> sampleData = new HashMap<>();
@@ -138,23 +141,23 @@ class WarcRecorderTest {
@Test @Test
public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException { public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
client.fetch(httpClient, ClassicRequestBuilder HttpGet request1 = new HttpGet("https://www.marginalia.nu/");
.get(new java.net.URI("https://www.marginalia.nu/")) request1.addHeader("User-agent", "test.marginalia.nu");
.addHeader("User-agent", "test.marginalia.nu") request1.addHeader("Accept-Encoding", "gzip");
.addHeader("Accept-Encoding", "gzip")
.build());
client.fetch(httpClient, ClassicRequestBuilder client.fetch(httpClient, request1);
.get(new java.net.URI("https://www.marginalia.nu/log/"))
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.build());
client.fetch(httpClient, ClassicRequestBuilder HttpGet request2 = new HttpGet("https://www.marginalia.nu/log/");
.get(new java.net.URI("https://www.marginalia.nu/sanic.png")) request2.addHeader("User-agent", "test.marginalia.nu");
.addHeader("User-agent", "test.marginalia.nu") request2.addHeader("Accept-Encoding", "gzip");
.addHeader("Accept-Encoding", "gzip")
.build()); client.fetch(httpClient, request2);
HttpGet request3 = new HttpGet("https://www.marginalia.nu/sanic.png");
request3.addHeader("User-agent", "test.marginalia.nu");
request3.addHeader("Accept-Encoding", "gzip");
client.fetch(httpClient, request3);
CrawledDocumentParquetRecordFileWriter.convertWarc( CrawledDocumentParquetRecordFileWriter.convertWarc(
"www.marginalia.nu", "www.marginalia.nu",