mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
4 Commits
deploy-012
...
deploy-012
Author | SHA1 | Date | |
---|---|---|---|
|
1a2aae496a | ||
|
353cdffb3f | ||
|
2e3f1313c7 | ||
|
58e6f141ce |
@@ -1,6 +1,6 @@
|
|||||||
package nu.marginalia.crawl.fetcher;
|
package nu.marginalia.crawl.fetcher;
|
||||||
|
|
||||||
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||||
|
|
||||||
/** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
|
/** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
|
||||||
public record ContentTags(String etag, String lastMod) {
|
public record ContentTags(String etag, String lastMod) {
|
||||||
@@ -17,14 +17,14 @@ public record ContentTags(String etag, String lastMod) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** Paints the tags onto the request builder. */
|
/** Paints the tags onto the request builder. */
|
||||||
public void paint(ClassicRequestBuilder getBuilder) {
|
public void paint(HttpGet request) {
|
||||||
|
|
||||||
if (etag != null) {
|
if (etag != null) {
|
||||||
getBuilder.addHeader("If-None-Match", etag);
|
request.addHeader("If-None-Match", etag);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (lastMod != null) {
|
if (lastMod != null) {
|
||||||
getBuilder.addHeader("If-Modified-Since", lastMod);
|
request.addHeader("If-Modified-Since", lastMod);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -17,6 +17,7 @@ import nu.marginalia.model.crawldata.CrawlerDomainStatus;
|
|||||||
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
|
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
|
||||||
import org.apache.hc.client5.http.HttpRequestRetryStrategy;
|
import org.apache.hc.client5.http.HttpRequestRetryStrategy;
|
||||||
import org.apache.hc.client5.http.classic.HttpClient;
|
import org.apache.hc.client5.http.classic.HttpClient;
|
||||||
|
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||||
import org.apache.hc.client5.http.config.ConnectionConfig;
|
import org.apache.hc.client5.http.config.ConnectionConfig;
|
||||||
import org.apache.hc.client5.http.config.RequestConfig;
|
import org.apache.hc.client5.http.config.RequestConfig;
|
||||||
import org.apache.hc.client5.http.cookie.BasicCookieStore;
|
import org.apache.hc.client5.http.cookie.BasicCookieStore;
|
||||||
@@ -99,7 +100,7 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
|||||||
final RequestConfig defaultRequestConfig = RequestConfig.custom()
|
final RequestConfig defaultRequestConfig = RequestConfig.custom()
|
||||||
.setCookieSpec(StandardCookieSpec.RELAXED)
|
.setCookieSpec(StandardCookieSpec.RELAXED)
|
||||||
.setResponseTimeout(10, TimeUnit.SECONDS)
|
.setResponseTimeout(10, TimeUnit.SECONDS)
|
||||||
.setConnectionRequestTimeout(8, TimeUnit.SECONDS)
|
.setConnectionRequestTimeout(5, TimeUnit.MINUTES)
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
return HttpClients.custom()
|
return HttpClients.custom()
|
||||||
@@ -398,16 +399,16 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ClassicRequestBuilder getBuilder = ClassicRequestBuilder.get(url.asURI())
|
HttpGet request = new HttpGet(url.asURI());
|
||||||
.addHeader("User-Agent", userAgentString)
|
request.addHeader("User-Agent", userAgentString);
|
||||||
.addHeader("Accept-Encoding", "gzip")
|
request.addHeader("Accept-Encoding", "gzip");
|
||||||
.addHeader("Accept-Language", "en,*;q=0.5")
|
request.addHeader("Accept-Language", "en,*;q=0.5");
|
||||||
.addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8");
|
request.addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8");
|
||||||
|
|
||||||
contentTags.paint(getBuilder);
|
contentTags.paint(request);
|
||||||
|
|
||||||
try (var sl = new SendLock()) {
|
try (var sl = new SendLock()) {
|
||||||
HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());
|
HttpFetchResult result = warcRecorder.fetch(client, request);
|
||||||
|
|
||||||
if (result instanceof HttpFetchResult.ResultOk ok) {
|
if (result instanceof HttpFetchResult.ResultOk ok) {
|
||||||
if (ok.statusCode() == 304) {
|
if (ok.statusCode() == 304) {
|
||||||
@@ -419,7 +420,7 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
|||||||
case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {}", ok.statusCode(), url);
|
case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {}", ok.statusCode(), url);
|
||||||
case HttpFetchResult.ResultRedirect redirect -> logger.info(crawlerAuditMarker, "Fetch result redirect: {} for {}", redirect.url(), url);
|
case HttpFetchResult.ResultRedirect redirect -> logger.info(crawlerAuditMarker, "Fetch result redirect: {} for {}", redirect.url(), url);
|
||||||
case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none for {}", url);
|
case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none for {}", url);
|
||||||
case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception: {} for {}", ex.getClass().getSimpleName(), url);
|
case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception for " + url + ": {}", ex.ex());
|
||||||
case HttpFetchResult.Result304Raw raw -> logger.info(crawlerAuditMarker, "Fetch result: 304 Raw for {}", url);
|
case HttpFetchResult.Result304Raw raw -> logger.info(crawlerAuditMarker, "Fetch result: 304 Raw for {}", url);
|
||||||
case HttpFetchResult.Result304ReplacedWithReference ref -> logger.info(crawlerAuditMarker, "Fetch result: 304 With reference for {}", url);
|
case HttpFetchResult.Result304ReplacedWithReference ref -> logger.info(crawlerAuditMarker, "Fetch result: 304 With reference for {}", url);
|
||||||
}
|
}
|
||||||
@@ -494,13 +495,13 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
|
private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException {
|
||||||
ClassicHttpRequest getRequest = ClassicRequestBuilder.get(sitemapUrl.asURI())
|
HttpGet getRequest = new HttpGet(sitemapUrl.asURI());
|
||||||
.addHeader("User-Agent", userAgentString)
|
|
||||||
.addHeader("Accept-Encoding", "gzip")
|
getRequest.addHeader("User-Agent", userAgentString);
|
||||||
.addHeader("Accept", "text/*, */*;q=0.9")
|
getRequest.addHeader("Accept-Encoding", "gzip");
|
||||||
.addHeader("User-Agent", userAgentString)
|
getRequest.addHeader("Accept", "text/*, */*;q=0.9");
|
||||||
.build();
|
getRequest.addHeader("User-Agent", userAgentString);
|
||||||
|
|
||||||
try (var sl = new SendLock()) {
|
try (var sl = new SendLock()) {
|
||||||
return client.execute(getRequest, response -> {
|
return client.execute(getRequest, response -> {
|
||||||
@@ -574,11 +575,10 @@ public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
|||||||
private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
|
private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
|
||||||
try (var sl = new SendLock()) {
|
try (var sl = new SendLock()) {
|
||||||
|
|
||||||
ClassicHttpRequest request = ClassicRequestBuilder.get(url.asURI())
|
HttpGet request = new HttpGet(url.asURI());
|
||||||
.addHeader("User-Agent", userAgentString)
|
request.addHeader("User-Agent", userAgentString);
|
||||||
.addHeader("Accept-Encoding", "gzip")
|
request.addHeader("Accept-Encoding", "gzip");
|
||||||
.addHeader("Accept", "text/*, */*;q=0.9")
|
request.addHeader("Accept", "text/*, */*;q=0.9");
|
||||||
.build();
|
|
||||||
|
|
||||||
HttpFetchResult result = recorder.fetch(client, request);
|
HttpFetchResult result = recorder.fetch(client, request);
|
||||||
|
|
||||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.crawl.fetcher.warc;
|
|||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.io.input.BOMInputStream;
|
import org.apache.commons.io.input.BOMInputStream;
|
||||||
|
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||||
import org.apache.hc.core5.http.ClassicHttpResponse;
|
import org.apache.hc.core5.http.ClassicHttpResponse;
|
||||||
import org.apache.hc.core5.http.Header;
|
import org.apache.hc.core5.http.Header;
|
||||||
import org.netpreserve.jwarc.WarcTruncationReason;
|
import org.netpreserve.jwarc.WarcTruncationReason;
|
||||||
@@ -43,7 +44,9 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
|||||||
* and suppressed from the headers.
|
* and suppressed from the headers.
|
||||||
* If an error occurs, a buffer will be created with no content and an error status.
|
* If an error occurs, a buffer will be created with no content and an error status.
|
||||||
*/
|
*/
|
||||||
static WarcInputBuffer forResponse(ClassicHttpResponse response, Duration timeLimit) throws IOException {
|
static WarcInputBuffer forResponse(ClassicHttpResponse response,
|
||||||
|
HttpGet request,
|
||||||
|
Duration timeLimit) throws IOException {
|
||||||
if (response == null)
|
if (response == null)
|
||||||
return new ErrorBuffer();
|
return new ErrorBuffer();
|
||||||
|
|
||||||
@@ -57,13 +60,25 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
|||||||
InputStream is = entity.getContent();
|
InputStream is = entity.getContent();
|
||||||
long length = entity.getContentLength();
|
long length = entity.getContentLength();
|
||||||
|
|
||||||
try (response) {
|
try {
|
||||||
if (length > 0 && length < 8192) {
|
if (length > 0 && length < 8192) {
|
||||||
// If the content is small and not compressed, we can just read it into memory
|
// If the content is small and not compressed, we can just read it into memory
|
||||||
return new MemoryBuffer(response.getHeaders(), timeLimit, is, (int) length);
|
return new MemoryBuffer(response.getHeaders(), request, timeLimit, is, (int) length);
|
||||||
} else {
|
} else {
|
||||||
// Otherwise, we unpack it into a file and read it from there
|
// Otherwise, we unpack it into a file and read it from there
|
||||||
return new FileBuffer(response.getHeaders(), timeLimit, is);
|
return new FileBuffer(response.getHeaders(), request, timeLimit, is);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
finally {
|
||||||
|
try {
|
||||||
|
is.skip(Long.MAX_VALUE);
|
||||||
|
}
|
||||||
|
catch (IOException e) {
|
||||||
|
// Ignore the exception
|
||||||
|
}
|
||||||
|
finally {
|
||||||
|
// Close the input stream
|
||||||
|
IOUtils.closeQuietly(is);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -71,7 +86,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** Copy an input stream to an output stream, with a maximum size and time limit */
|
/** Copy an input stream to an output stream, with a maximum size and time limit */
|
||||||
protected void copy(InputStream is, OutputStream os, Duration timeLimit) {
|
protected void copy(InputStream is, HttpGet request, OutputStream os, Duration timeLimit) {
|
||||||
Instant start = Instant.now();
|
Instant start = Instant.now();
|
||||||
Instant timeout = start.plus(timeLimit);
|
Instant timeout = start.plus(timeLimit);
|
||||||
long size = 0;
|
long size = 0;
|
||||||
@@ -86,6 +101,10 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
|||||||
Duration remaining = Duration.between(Instant.now(), timeout);
|
Duration remaining = Duration.between(Instant.now(), timeout);
|
||||||
if (remaining.isNegative()) {
|
if (remaining.isNegative()) {
|
||||||
truncationReason = WarcTruncationReason.TIME;
|
truncationReason = WarcTruncationReason.TIME;
|
||||||
|
// Abort the request if the time limit is exceeded
|
||||||
|
// so we don't keep the connection open forever or are forced to consume
|
||||||
|
// the stream to the end
|
||||||
|
request.abort();
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -104,6 +123,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
else if (truncationReason != WarcTruncationReason.LENGTH) {
|
else if (truncationReason != WarcTruncationReason.LENGTH) {
|
||||||
truncationReason = WarcTruncationReason.LENGTH;
|
truncationReason = WarcTruncationReason.LENGTH;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
@@ -111,13 +131,6 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Try to close the connection as long as we haven't timed out.
|
|
||||||
// As per Apache HttpClient's semantics, this will reset the connection
|
|
||||||
// and close the stream if we have timed out.
|
|
||||||
|
|
||||||
if (truncationReason != WarcTruncationReason.TIME) {
|
|
||||||
IOUtils.closeQuietly(is);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Takes a Content-Range header and checks if it is complete.
|
/** Takes a Content-Range header and checks if it is complete.
|
||||||
@@ -218,7 +231,7 @@ class ErrorBuffer extends WarcInputBuffer {
|
|||||||
/** Buffer for when we have the response in memory */
|
/** Buffer for when we have the response in memory */
|
||||||
class MemoryBuffer extends WarcInputBuffer {
|
class MemoryBuffer extends WarcInputBuffer {
|
||||||
byte[] data;
|
byte[] data;
|
||||||
public MemoryBuffer(Header[] headers, Duration timeLimit, InputStream responseStream, int size) {
|
public MemoryBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream, int size) {
|
||||||
super(suppressContentEncoding(headers));
|
super(suppressContentEncoding(headers));
|
||||||
|
|
||||||
if (!isRangeComplete(headers)) {
|
if (!isRangeComplete(headers)) {
|
||||||
@@ -229,7 +242,7 @@ class MemoryBuffer extends WarcInputBuffer {
|
|||||||
|
|
||||||
var outputStream = new ByteArrayOutputStream(size);
|
var outputStream = new ByteArrayOutputStream(size);
|
||||||
|
|
||||||
copy(responseStream, outputStream, timeLimit);
|
copy(responseStream, request, outputStream, timeLimit);
|
||||||
|
|
||||||
data = outputStream.toByteArray();
|
data = outputStream.toByteArray();
|
||||||
}
|
}
|
||||||
@@ -253,7 +266,7 @@ class MemoryBuffer extends WarcInputBuffer {
|
|||||||
class FileBuffer extends WarcInputBuffer {
|
class FileBuffer extends WarcInputBuffer {
|
||||||
private final Path tempFile;
|
private final Path tempFile;
|
||||||
|
|
||||||
public FileBuffer(Header[] headers, Duration timeLimit, InputStream responseStream) throws IOException {
|
public FileBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream) throws IOException {
|
||||||
super(suppressContentEncoding(headers));
|
super(suppressContentEncoding(headers));
|
||||||
|
|
||||||
if (!isRangeComplete(headers)) {
|
if (!isRangeComplete(headers)) {
|
||||||
@@ -265,7 +278,7 @@ class FileBuffer extends WarcInputBuffer {
|
|||||||
this.tempFile = Files.createTempFile("rsp", ".html");
|
this.tempFile = Files.createTempFile("rsp", ".html");
|
||||||
|
|
||||||
try (var out = Files.newOutputStream(tempFile)) {
|
try (var out = Files.newOutputStream(tempFile)) {
|
||||||
copy(responseStream, out, timeLimit);
|
copy(responseStream, request, out, timeLimit);
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
||||||
|
@@ -8,9 +8,9 @@ import nu.marginalia.model.EdgeDomain;
|
|||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.body.HttpFetchResult;
|
import nu.marginalia.model.body.HttpFetchResult;
|
||||||
import org.apache.hc.client5.http.classic.HttpClient;
|
import org.apache.hc.client5.http.classic.HttpClient;
|
||||||
|
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||||
import org.apache.hc.client5.http.cookie.BasicCookieStore;
|
import org.apache.hc.client5.http.cookie.BasicCookieStore;
|
||||||
import org.apache.hc.client5.http.cookie.CookieStore;
|
import org.apache.hc.client5.http.cookie.CookieStore;
|
||||||
import org.apache.hc.core5.http.ClassicHttpRequest;
|
|
||||||
import org.apache.hc.core5.http.NameValuePair;
|
import org.apache.hc.core5.http.NameValuePair;
|
||||||
import org.jetbrains.annotations.Nullable;
|
import org.jetbrains.annotations.Nullable;
|
||||||
import org.netpreserve.jwarc.*;
|
import org.netpreserve.jwarc.*;
|
||||||
@@ -89,14 +89,14 @@ public class WarcRecorder implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public HttpFetchResult fetch(HttpClient client,
|
public HttpFetchResult fetch(HttpClient client,
|
||||||
ClassicHttpRequest request)
|
HttpGet request)
|
||||||
throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
|
throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
|
||||||
{
|
{
|
||||||
return fetch(client, request, Duration.ofMillis(MAX_TIME));
|
return fetch(client, request, Duration.ofMillis(MAX_TIME));
|
||||||
}
|
}
|
||||||
|
|
||||||
public HttpFetchResult fetch(HttpClient client,
|
public HttpFetchResult fetch(HttpClient client,
|
||||||
ClassicHttpRequest request,
|
HttpGet request,
|
||||||
Duration timeout)
|
Duration timeout)
|
||||||
throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
|
throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
|
||||||
{
|
{
|
||||||
@@ -117,7 +117,7 @@ public class WarcRecorder implements AutoCloseable {
|
|||||||
try {
|
try {
|
||||||
return client.execute(request, response -> {
|
return client.execute(request, response -> {
|
||||||
|
|
||||||
try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, timeout);
|
try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request, timeout);
|
||||||
InputStream inputStream = inputBuffer.read()) {
|
InputStream inputStream = inputBuffer.read()) {
|
||||||
|
|
||||||
// Build and write the request
|
// Build and write the request
|
||||||
@@ -237,6 +237,7 @@ public class WarcRecorder implements AutoCloseable {
|
|||||||
dataStart,
|
dataStart,
|
||||||
responseDataBuffer.length() - dataStart);
|
responseDataBuffer.length() - dataStart);
|
||||||
} catch (Exception ex) {
|
} catch (Exception ex) {
|
||||||
|
ex.printStackTrace();
|
||||||
flagAsError(new EdgeUrl(requestUri), ex); // write a WARC record to indicate the error
|
flagAsError(new EdgeUrl(requestUri), ex); // write a WARC record to indicate the error
|
||||||
logger.warn("Failed to fetch URL {}: {}", requestUri, ex.getMessage());
|
logger.warn("Failed to fetch URL {}: {}", requestUri, ex.getMessage());
|
||||||
return new HttpFetchResult.ResultException(ex);
|
return new HttpFetchResult.ResultException(ex);
|
||||||
@@ -249,6 +250,7 @@ public class WarcRecorder implements AutoCloseable {
|
|||||||
flagAsTimeout(new EdgeUrl(requestUri)); // write a WARC record to indicate the timeout
|
flagAsTimeout(new EdgeUrl(requestUri)); // write a WARC record to indicate the timeout
|
||||||
return new HttpFetchResult.ResultException(ex);
|
return new HttpFetchResult.ResultException(ex);
|
||||||
} catch (IOException ex) {
|
} catch (IOException ex) {
|
||||||
|
ex.printStackTrace();
|
||||||
flagAsError(new EdgeUrl(requestUri), ex); // write a WARC record to indicate the error
|
flagAsError(new EdgeUrl(requestUri), ex); // write a WARC record to indicate the error
|
||||||
logger.warn("Failed to fetch URL {}: {}", requestUri, ex.getMessage());
|
logger.warn("Failed to fetch URL {}: {}", requestUri, ex.getMessage());
|
||||||
return new HttpFetchResult.ResultException(ex);
|
return new HttpFetchResult.ResultException(ex);
|
||||||
|
@@ -53,7 +53,7 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
private final CrawlerRevisitor crawlerRevisitor;
|
private final CrawlerRevisitor crawlerRevisitor;
|
||||||
|
|
||||||
private static final CrawlerConnectionThrottle connectionThrottle = new CrawlerConnectionThrottle(
|
private static final CrawlerConnectionThrottle connectionThrottle = new CrawlerConnectionThrottle(
|
||||||
Duration.ofSeconds(1) // pace the connections to avoid network congestion by waiting 1 second between establishing them
|
Duration.ofSeconds(1) // pace the connections to avoid network congestion at startup
|
||||||
);
|
);
|
||||||
|
|
||||||
int errorCount = 0;
|
int errorCount = 0;
|
||||||
|
@@ -91,14 +91,14 @@ class HttpFetcherImplContentTypeProbeTest {
|
|||||||
@Test
|
@Test
|
||||||
public void testProbeContentTypeHtmlShortcircuitPath() throws URISyntaxException {
|
public void testProbeContentTypeHtmlShortcircuitPath() throws URISyntaxException {
|
||||||
var result = fetcher.probeContentType(new EdgeUrl("https://localhost/test.html"), new CrawlDelayTimer(50), ContentTags.empty());
|
var result = fetcher.probeContentType(new EdgeUrl("https://localhost/test.html"), new CrawlDelayTimer(50), ContentTags.empty());
|
||||||
Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Ok.class, result);
|
Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testProbeContentTypeHtmlShortcircuitTags() {
|
public void testProbeContentTypeHtmlShortcircuitTags() {
|
||||||
var result = fetcher.probeContentType(contentTypeBinaryUrl, new CrawlDelayTimer(50), new ContentTags("a", "b"));
|
var result = fetcher.probeContentType(contentTypeBinaryUrl, new CrawlDelayTimer(50), new ContentTags("a", "b"));
|
||||||
Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Ok.class, result);
|
Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@@ -4,9 +4,9 @@ import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
|||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import org.apache.hc.client5.http.classic.HttpClient;
|
import org.apache.hc.client5.http.classic.HttpClient;
|
||||||
|
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||||
import org.apache.hc.client5.http.cookie.BasicCookieStore;
|
import org.apache.hc.client5.http.cookie.BasicCookieStore;
|
||||||
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||||
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
|
||||||
import org.junit.jupiter.api.AfterEach;
|
import org.junit.jupiter.api.AfterEach;
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
@@ -78,10 +78,10 @@ class CrawlerWarcResynchronizerTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
|
void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
|
||||||
var req = ClassicRequestBuilder.get(new java.net.URI(url))
|
HttpGet request = new HttpGet(url);
|
||||||
.addHeader("User-agent", "test.marginalia.nu")
|
request.addHeader("User-agent", "test.marginalia.nu");
|
||||||
.addHeader("Accept-Encoding", "gzip")
|
request.addHeader("Accept-Encoding", "gzip");
|
||||||
.build();
|
|
||||||
recorder.fetch(httpClient, req);
|
recorder.fetch(httpClient, request);
|
||||||
}
|
}
|
||||||
}
|
}
|
@@ -3,9 +3,9 @@ package nu.marginalia.crawl.retreival.fetcher;
|
|||||||
import com.sun.net.httpserver.HttpServer;
|
import com.sun.net.httpserver.HttpServer;
|
||||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||||
import org.apache.hc.client5.http.classic.HttpClient;
|
import org.apache.hc.client5.http.classic.HttpClient;
|
||||||
|
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||||
import org.apache.hc.client5.http.cookie.BasicCookieStore;
|
import org.apache.hc.client5.http.cookie.BasicCookieStore;
|
||||||
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||||
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
|
||||||
import org.junit.jupiter.api.*;
|
import org.junit.jupiter.api.*;
|
||||||
import org.netpreserve.jwarc.WarcReader;
|
import org.netpreserve.jwarc.WarcReader;
|
||||||
import org.netpreserve.jwarc.WarcRequest;
|
import org.netpreserve.jwarc.WarcRequest;
|
||||||
@@ -51,14 +51,14 @@ class WarcRecorderFakeServerTest {
|
|||||||
os.write("<html><body>hello</body></html>".getBytes());
|
os.write("<html><body>hello</body></html>".getBytes());
|
||||||
os.flush();
|
os.flush();
|
||||||
try {
|
try {
|
||||||
TimeUnit.SECONDS.sleep(1);
|
TimeUnit.SECONDS.sleep(2);
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
os.write(":".getBytes());
|
os.write(":".getBytes());
|
||||||
os.flush();
|
os.flush();
|
||||||
try {
|
try {
|
||||||
TimeUnit.SECONDS.sleep(1);
|
TimeUnit.SECONDS.sleep(2);
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
@@ -100,13 +100,10 @@ class WarcRecorderFakeServerTest {
|
|||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void fetchFast() throws Exception {
|
public void fetchFast() throws Exception {
|
||||||
client.fetch(httpClient,
|
HttpGet request = new HttpGet("http://localhost:14510/fast");
|
||||||
ClassicRequestBuilder
|
request.addHeader("User-agent", "test.marginalia.nu");
|
||||||
.get(new java.net.URI("http://localhost:14510/fast"))
|
request.addHeader("Accept-Encoding", "gzip");
|
||||||
.addHeader("User-agent", "test.marginalia.nu")
|
client.fetch(httpClient, request);
|
||||||
.addHeader("Accept-Encoding", "gzip")
|
|
||||||
.build()
|
|
||||||
);
|
|
||||||
|
|
||||||
Map<String, String> sampleData = new HashMap<>();
|
Map<String, String> sampleData = new HashMap<>();
|
||||||
try (var warcReader = new WarcReader(fileNameWarc)) {
|
try (var warcReader = new WarcReader(fileNameWarc)) {
|
||||||
@@ -127,11 +124,12 @@ class WarcRecorderFakeServerTest {
|
|||||||
public void fetchSlow() throws Exception {
|
public void fetchSlow() throws Exception {
|
||||||
Instant start = Instant.now();
|
Instant start = Instant.now();
|
||||||
|
|
||||||
|
HttpGet request = new HttpGet("http://localhost:14510/slow");
|
||||||
|
request.addHeader("User-agent", "test.marginalia.nu");
|
||||||
|
request.addHeader("Accept-Encoding", "gzip");
|
||||||
|
|
||||||
client.fetch(httpClient,
|
client.fetch(httpClient,
|
||||||
ClassicRequestBuilder.get(new java.net.URI("http://localhost:14510/slow"))
|
request,
|
||||||
.addHeader("User-agent", "test.marginalia.nu")
|
|
||||||
.addHeader("Accept-Encoding", "gzip")
|
|
||||||
.build(),
|
|
||||||
Duration.ofSeconds(1)
|
Duration.ofSeconds(1)
|
||||||
);
|
);
|
||||||
Instant end = Instant.now();
|
Instant end = Instant.now();
|
||||||
@@ -149,6 +147,8 @@ class WarcRecorderFakeServerTest {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
System.out.println(
|
||||||
|
Files.readString(fileNameWarc));
|
||||||
System.out.println(sampleData);
|
System.out.println(sampleData);
|
||||||
|
|
||||||
// Timeout is set to 1 second, but the server will take 5 seconds to respond,
|
// Timeout is set to 1 second, but the server will take 5 seconds to respond,
|
||||||
|
@@ -7,9 +7,9 @@ import nu.marginalia.model.EdgeUrl;
|
|||||||
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
|
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
|
||||||
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
|
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
|
||||||
import org.apache.hc.client5.http.classic.HttpClient;
|
import org.apache.hc.client5.http.classic.HttpClient;
|
||||||
|
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||||
import org.apache.hc.client5.http.cookie.BasicCookieStore;
|
import org.apache.hc.client5.http.cookie.BasicCookieStore;
|
||||||
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||||
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
|
||||||
import org.junit.jupiter.api.AfterEach;
|
import org.junit.jupiter.api.AfterEach;
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
@@ -52,11 +52,14 @@ class WarcRecorderTest {
|
|||||||
|
|
||||||
@Test
|
@Test
|
||||||
void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
|
void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
|
||||||
|
|
||||||
|
HttpGet request = new HttpGet("https://www.marginalia.nu/");
|
||||||
|
request.addHeader("User-agent", "test.marginalia.nu");
|
||||||
|
request.addHeader("Accept-Encoding", "gzip");
|
||||||
|
client.fetch(httpClient, request);
|
||||||
|
|
||||||
client.fetch(httpClient,
|
client.fetch(httpClient,
|
||||||
ClassicRequestBuilder.get(new java.net.URI("https://www.marginalia.nu/"))
|
request
|
||||||
.addHeader("User-agent", "test.marginalia.nu")
|
|
||||||
.addHeader("Accept-Encoding", "gzip")
|
|
||||||
.build()
|
|
||||||
);
|
);
|
||||||
|
|
||||||
Map<String, String> sampleData = new HashMap<>();
|
Map<String, String> sampleData = new HashMap<>();
|
||||||
@@ -138,23 +141,23 @@ class WarcRecorderTest {
|
|||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
|
public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
|
||||||
client.fetch(httpClient, ClassicRequestBuilder
|
HttpGet request1 = new HttpGet("https://www.marginalia.nu/");
|
||||||
.get(new java.net.URI("https://www.marginalia.nu/"))
|
request1.addHeader("User-agent", "test.marginalia.nu");
|
||||||
.addHeader("User-agent", "test.marginalia.nu")
|
request1.addHeader("Accept-Encoding", "gzip");
|
||||||
.addHeader("Accept-Encoding", "gzip")
|
|
||||||
.build());
|
|
||||||
|
|
||||||
client.fetch(httpClient, ClassicRequestBuilder
|
client.fetch(httpClient, request1);
|
||||||
.get(new java.net.URI("https://www.marginalia.nu/log/"))
|
|
||||||
.addHeader("User-agent", "test.marginalia.nu")
|
|
||||||
.addHeader("Accept-Encoding", "gzip")
|
|
||||||
.build());
|
|
||||||
|
|
||||||
client.fetch(httpClient, ClassicRequestBuilder
|
HttpGet request2 = new HttpGet("https://www.marginalia.nu/log/");
|
||||||
.get(new java.net.URI("https://www.marginalia.nu/sanic.png"))
|
request2.addHeader("User-agent", "test.marginalia.nu");
|
||||||
.addHeader("User-agent", "test.marginalia.nu")
|
request2.addHeader("Accept-Encoding", "gzip");
|
||||||
.addHeader("Accept-Encoding", "gzip")
|
|
||||||
.build());
|
client.fetch(httpClient, request2);
|
||||||
|
|
||||||
|
HttpGet request3 = new HttpGet("https://www.marginalia.nu/sanic.png");
|
||||||
|
request3.addHeader("User-agent", "test.marginalia.nu");
|
||||||
|
request3.addHeader("Accept-Encoding", "gzip");
|
||||||
|
|
||||||
|
client.fetch(httpClient, request3);
|
||||||
|
|
||||||
CrawledDocumentParquetRecordFileWriter.convertWarc(
|
CrawledDocumentParquetRecordFileWriter.convertWarc(
|
||||||
"www.marginalia.nu",
|
"www.marginalia.nu",
|
||||||
|
Reference in New Issue
Block a user