mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
47 Commits
deploy-011
...
deploy-014
Author | SHA1 | Date | |
---|---|---|---|
|
2a737c34bb | ||
|
90a577af82 | ||
|
f0c9b935d8 | ||
|
7b5493dd51 | ||
|
c246a59158 | ||
|
0b99781d24 | ||
|
39db9620c1 | ||
|
1781599363 | ||
|
6b2d18fb9b | ||
|
59b1d200ab | ||
|
897010a2cf | ||
|
602af7a77e | ||
|
a7d91c8527 | ||
|
7151602124 | ||
|
884e33bd4a | ||
|
e84d5c497a | ||
|
2d2d3e2466 | ||
|
647dd9b12f | ||
|
de4e2849ce | ||
|
3c43f1954e | ||
|
fa2462ec39 | ||
|
f4ad7145db | ||
|
068b450180 | ||
|
05b909a21f | ||
|
3d179cddce | ||
|
1a2aae496a | ||
|
353cdffb3f | ||
|
2e3f1313c7 | ||
|
58e6f141ce | ||
|
500f63e921 | ||
|
6dfbedda1e | ||
|
9715ddb105 | ||
|
1fc6313a77 | ||
|
b1249d5b8a | ||
|
ef95d59b07 | ||
|
acdd8664f5 | ||
|
6b12eac58a | ||
|
bb3f1f395a | ||
|
b661beef41 | ||
|
9888c47f19 | ||
|
dcef7e955b | ||
|
b3973a1dd7 | ||
|
8bd05d6d90 | ||
|
59df8e356e | ||
|
7161162a35 | ||
|
d7c4c5141f | ||
|
88e9b8fb05 |
@@ -5,6 +5,7 @@
|
||||
<Filters>
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
@@ -13,9 +14,20 @@
|
||||
<Filters>
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
<SizeBasedTriggeringPolicy size="10MB" />
|
||||
</RollingFile>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||
</PatternLayout>
|
||||
<SizeBasedTriggeringPolicy size="100MB" />
|
||||
<Filters>
|
||||
<MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</RollingFile>
|
||||
</Appenders>
|
||||
<Loggers>
|
||||
<Logger name="org.apache.zookeeper" level="WARN" />
|
||||
|
@@ -5,6 +5,7 @@
|
||||
<Filters>
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
@@ -17,6 +18,17 @@
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</RollingFile>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/crawler-audit-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/crawler-audit-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
<Pattern>%d{yyyy-MM-dd HH:mm:ss,SSS}: %msg{nolookups}%n</Pattern>
|
||||
</PatternLayout>
|
||||
<SizeBasedTriggeringPolicy size="100MB" />
|
||||
<Filters>
|
||||
<MarkerFilter marker="CRAWLER" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</RollingFile>
|
||||
</Appenders>
|
||||
|
@@ -8,6 +8,7 @@ import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.actor.state.Resume;
|
||||
import nu.marginalia.service.control.ServiceEventLog;
|
||||
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorage;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
@@ -19,6 +20,7 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.*;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URI;
|
||||
import java.net.URL;
|
||||
@@ -32,6 +34,7 @@ public class DownloadSampleActor extends RecordActorPrototype {
|
||||
|
||||
private final FileStorageService storageService;
|
||||
private final ServiceEventLog eventLog;
|
||||
private final ServiceHeartbeat heartbeat;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@Resume(behavior = ActorResumeBehavior.ERROR)
|
||||
@@ -66,15 +69,39 @@ public class DownloadSampleActor extends RecordActorPrototype {
|
||||
|
||||
Files.deleteIfExists(Path.of(tarFileName));
|
||||
|
||||
try (var is = new BufferedInputStream(new URI(downloadURI).toURL().openStream());
|
||||
var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
|
||||
is.transferTo(os);
|
||||
HttpURLConnection urlConnection = (HttpURLConnection) new URI(downloadURI).toURL().openConnection();
|
||||
|
||||
try (var hb = heartbeat.createServiceAdHocTaskHeartbeat("Downloading sample")) {
|
||||
long size = urlConnection.getContentLengthLong();
|
||||
byte[] buffer = new byte[8192];
|
||||
|
||||
try (var is = new BufferedInputStream(urlConnection.getInputStream());
|
||||
var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
|
||||
long copiedSize = 0;
|
||||
|
||||
while (copiedSize < size) {
|
||||
int read = is.read(buffer);
|
||||
|
||||
if (read < 0) // We've been promised a file of length 'size'
|
||||
throw new IOException("Unexpected end of stream");
|
||||
|
||||
os.write(buffer, 0, read);
|
||||
copiedSize += read;
|
||||
|
||||
// Update progress bar
|
||||
hb.progress(String.format("%d MB", copiedSize / 1024 / 1024), (int) (copiedSize / 1024), (int) (size / 1024));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
catch (Exception ex) {
|
||||
eventLog.logEvent(DownloadSampleActor.class, "Error downloading sample");
|
||||
logger.error("Error downloading sample", ex);
|
||||
yield new Error();
|
||||
}
|
||||
finally {
|
||||
urlConnection.disconnect();
|
||||
}
|
||||
|
||||
eventLog.logEvent(DownloadSampleActor.class, "Download complete");
|
||||
yield new Extract(fileStorageId, tarFileName);
|
||||
@@ -170,11 +197,12 @@ public class DownloadSampleActor extends RecordActorPrototype {
|
||||
@Inject
|
||||
public DownloadSampleActor(Gson gson,
|
||||
FileStorageService storageService,
|
||||
ServiceEventLog eventLog)
|
||||
ServiceEventLog eventLog, ServiceHeartbeat heartbeat)
|
||||
{
|
||||
super(gson);
|
||||
this.storageService = storageService;
|
||||
this.eventLog = eventLog;
|
||||
this.heartbeat = heartbeat;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -87,6 +87,8 @@ dependencies {
|
||||
implementation libs.commons.compress
|
||||
implementation libs.sqlite
|
||||
|
||||
implementation libs.bundles.httpcomponents
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
|
@@ -8,7 +8,6 @@ import nu.marginalia.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.converting.processor.DomainProcessor;
|
||||
import nu.marginalia.crawl.CrawlerMain;
|
||||
import nu.marginalia.crawl.DomainStateDb;
|
||||
import nu.marginalia.crawl.fetcher.Cookies;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
@@ -247,7 +246,7 @@ public class CrawlingThenConvertingIntegrationTest {
|
||||
private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws Exception {
|
||||
List<SerializableCrawlData> data = new ArrayList<>();
|
||||
|
||||
try (var recorder = new WarcRecorder(fileName, new Cookies());
|
||||
try (var recorder = new WarcRecorder(fileName);
|
||||
var db = new DomainStateDb(dbTempFile))
|
||||
{
|
||||
new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, db, recorder).crawlDomain();
|
||||
|
@@ -60,10 +60,14 @@ dependencies {
|
||||
implementation libs.fastutil
|
||||
|
||||
implementation libs.bundles.mariadb
|
||||
implementation libs.bundles.httpcomponents
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
testImplementation libs.wiremock
|
||||
|
||||
|
||||
|
||||
testImplementation project(':code:processes:test-data')
|
||||
}
|
||||
|
@@ -43,6 +43,7 @@ import java.nio.file.StandardCopyOption;
|
||||
import java.security.Security;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
@@ -66,6 +67,8 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
|
||||
private final Map<String, CrawlTask> pendingCrawlTasks = new ConcurrentHashMap<>();
|
||||
|
||||
private final LinkedBlockingQueue<CrawlTask> retryQueue = new LinkedBlockingQueue<>();
|
||||
|
||||
private final AtomicInteger tasksDone = new AtomicInteger(0);
|
||||
private final HttpFetcherImpl fetcher;
|
||||
|
||||
@@ -277,12 +280,29 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
}
|
||||
|
||||
// Schedule viable tasks for execution until list is empty
|
||||
while (!taskList.isEmpty()) {
|
||||
taskList.removeIf(this::trySubmitDeferredTask);
|
||||
for (int emptyRuns = 0;emptyRuns < 300;) {
|
||||
boolean hasTasks = !taskList.isEmpty();
|
||||
|
||||
// Add a small pause here to avoid busy looping toward the end of the execution cycle when
|
||||
// we might have no new viable tasks to run for hours on end
|
||||
TimeUnit.MILLISECONDS.sleep(50);
|
||||
// The order of these checks is very important to avoid a race condition
|
||||
// where we miss a task that is put into the retry queue
|
||||
boolean hasRunningTasks = pool.getActiveCount() > 0;
|
||||
boolean hasRetryTasks = !retryQueue.isEmpty();
|
||||
|
||||
if (hasTasks || hasRetryTasks || hasRunningTasks) {
|
||||
retryQueue.drainTo(taskList);
|
||||
|
||||
// Try to submit any tasks that are in the retry queue (this will block if the pool is full)
|
||||
taskList.removeIf(this::trySubmitDeferredTask);
|
||||
|
||||
// Add a small pause here to avoid busy looping toward the end of the execution cycle when
|
||||
// we might have no new viable tasks to run for hours on end
|
||||
TimeUnit.MILLISECONDS.sleep(5);
|
||||
} else {
|
||||
// We have no tasks to run, and no tasks in the retry queue
|
||||
// but we wait a bit to see if any new tasks come in via the retry queue
|
||||
emptyRuns++;
|
||||
TimeUnit.SECONDS.sleep(1);
|
||||
}
|
||||
}
|
||||
|
||||
logger.info("Shutting down the pool, waiting for tasks to complete...");
|
||||
@@ -414,7 +434,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
/** Best effort indicator whether we could start this now without getting stuck in
|
||||
* DomainLocks purgatory */
|
||||
public boolean canRun() {
|
||||
return domainLocks.canLock(new EdgeDomain(domain));
|
||||
return domainLocks.isLockableHint(new EdgeDomain(domain));
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -425,66 +445,82 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
return;
|
||||
}
|
||||
|
||||
Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
|
||||
Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
|
||||
Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
|
||||
|
||||
// Move the WARC file to a temp file if it exists, so we can resume the crawl using the old data
|
||||
// while writing to the same file name as before
|
||||
if (Files.exists(newWarcFile)) {
|
||||
Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING);
|
||||
}
|
||||
else {
|
||||
Files.deleteIfExists(tempFile);
|
||||
}
|
||||
|
||||
try (var warcRecorder = new WarcRecorder(newWarcFile, fetcher); // write to a temp file for now
|
||||
var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
|
||||
CrawlDataReference reference = getReference()
|
||||
)
|
||||
{
|
||||
// Resume the crawl if it was aborted
|
||||
if (Files.exists(tempFile)) {
|
||||
retriever.syncAbortedRun(tempFile);
|
||||
Files.delete(tempFile);
|
||||
Optional<DomainLocks.DomainLock> lock = domainLocks.tryLockDomain(new EdgeDomain(domain));
|
||||
// We don't have a lock, so we can't run this task
|
||||
// we return to avoid blocking the pool for too long
|
||||
if (lock.isEmpty()) {
|
||||
if (retryQueue.remainingCapacity() > 0) {
|
||||
// Sleep a moment to avoid busy looping via the retry queue
|
||||
// in the case when few tasks remain and almost all are ineligible for
|
||||
// immediate restart
|
||||
Thread.sleep(5);
|
||||
}
|
||||
|
||||
DomainLinks domainLinks = anchorTagsSource.getAnchorTags(domain);
|
||||
retryQueue.put(this);
|
||||
return;
|
||||
}
|
||||
DomainLocks.DomainLock domainLock = lock.get();
|
||||
|
||||
int size;
|
||||
try (var lock = domainLocks.lockDomain(new EdgeDomain(domain))) {
|
||||
size = retriever.crawlDomain(domainLinks, reference);
|
||||
try (domainLock) {
|
||||
Thread.currentThread().setName("crawling:" + domain);
|
||||
|
||||
Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
|
||||
Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
|
||||
Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
|
||||
|
||||
// Move the WARC file to a temp file if it exists, so we can resume the crawl using the old data
|
||||
// while writing to the same file name as before
|
||||
if (Files.exists(newWarcFile)) {
|
||||
Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING);
|
||||
}
|
||||
else {
|
||||
Files.deleteIfExists(tempFile);
|
||||
}
|
||||
|
||||
// Delete the reference crawl data if it's not the same as the new one
|
||||
// (mostly a case when migrating from legacy->warc)
|
||||
reference.delete();
|
||||
try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
|
||||
var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
|
||||
CrawlDataReference reference = getReference())
|
||||
{
|
||||
// Resume the crawl if it was aborted
|
||||
if (Files.exists(tempFile)) {
|
||||
retriever.syncAbortedRun(tempFile);
|
||||
Files.delete(tempFile);
|
||||
}
|
||||
|
||||
// Convert the WARC file to Parquet
|
||||
SlopCrawlDataRecord
|
||||
.convertWarc(domain, userAgent, newWarcFile, slopFile);
|
||||
DomainLinks domainLinks = anchorTagsSource.getAnchorTags(domain);
|
||||
|
||||
// Optionally archive the WARC file if full retention is enabled,
|
||||
// otherwise delete it:
|
||||
warcArchiver.consumeWarc(newWarcFile, domain);
|
||||
int size = retriever.crawlDomain(domainLinks, reference);
|
||||
|
||||
// Mark the domain as finished in the work log
|
||||
workLog.setJobToFinished(domain, slopFile.toString(), size);
|
||||
// Delete the reference crawl data if it's not the same as the new one
|
||||
// (mostly a case when migrating from legacy->warc)
|
||||
reference.delete();
|
||||
|
||||
// Update the progress bar
|
||||
heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);
|
||||
// Convert the WARC file to Slop
|
||||
SlopCrawlDataRecord
|
||||
.convertWarc(domain, userAgent, newWarcFile, slopFile);
|
||||
|
||||
logger.info("Fetched {}", domain);
|
||||
} catch (Exception e) {
|
||||
logger.error("Error fetching domain " + domain, e);
|
||||
}
|
||||
finally {
|
||||
// We don't need to double-count these; it's also kept in the workLog
|
||||
pendingCrawlTasks.remove(domain);
|
||||
Thread.currentThread().setName("[idle]");
|
||||
// Optionally archive the WARC file if full retention is enabled,
|
||||
// otherwise delete it:
|
||||
warcArchiver.consumeWarc(newWarcFile, domain);
|
||||
|
||||
Files.deleteIfExists(newWarcFile);
|
||||
Files.deleteIfExists(tempFile);
|
||||
// Mark the domain as finished in the work log
|
||||
workLog.setJobToFinished(domain, slopFile.toString(), size);
|
||||
|
||||
// Update the progress bar
|
||||
heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);
|
||||
|
||||
logger.info("Fetched {}", domain);
|
||||
} catch (Exception e) {
|
||||
logger.error("Error fetching domain " + domain, e);
|
||||
}
|
||||
finally {
|
||||
// We don't need to double-count these; it's also kept in the workLog
|
||||
pendingCrawlTasks.remove(domain);
|
||||
Thread.currentThread().setName("[idle]");
|
||||
|
||||
Files.deleteIfExists(newWarcFile);
|
||||
Files.deleteIfExists(tempFile);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -1,6 +1,6 @@
|
||||
package nu.marginalia.crawl.fetcher;
|
||||
|
||||
import java.net.http.HttpRequest;
|
||||
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||
|
||||
/** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
|
||||
public record ContentTags(String etag, String lastMod) {
|
||||
@@ -17,14 +17,14 @@ public record ContentTags(String etag, String lastMod) {
|
||||
}
|
||||
|
||||
/** Paints the tags onto the request builder. */
|
||||
public void paint(HttpRequest.Builder getBuilder) {
|
||||
public void paint(HttpGet request) {
|
||||
|
||||
if (etag != null) {
|
||||
getBuilder.header("If-None-Match", etag);
|
||||
request.addHeader("If-None-Match", etag);
|
||||
}
|
||||
|
||||
if (lastMod != null) {
|
||||
getBuilder.header("If-Modified-Since", lastMod);
|
||||
request.addHeader("If-Modified-Since", lastMod);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -1,34 +0,0 @@
|
||||
package nu.marginalia.crawl.fetcher;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.CookieHandler;
|
||||
import java.net.URI;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
/**
 * A {@link CookieHandler} that keeps a separate, in-memory cookie jar for each thread.
 *
 * <p>The jar maps a response header name to the list of values received for it.
 * Only {@code Set-Cookie} / {@code Set-Cookie2} response headers are recorded, so that
 * {@link #hasCookies()} and {@link #getCookies()} reflect cookies rather than arbitrary
 * response headers.
 *
 * <p>Because the jar lives in a {@link ThreadLocal}, every method operates on the jar of the
 * calling thread only.
 */
public class Cookies extends CookieHandler {
    final ThreadLocal<ConcurrentHashMap<String, List<String>>> cookieJar = ThreadLocal.withInitial(ConcurrentHashMap::new);

    /** Empties the calling thread's cookie jar. */
    public void clear() {
        cookieJar.get().clear();
    }

    /** @return true if the calling thread's jar holds at least one recorded cookie header */
    public boolean hasCookies() {
        return !cookieJar.get().isEmpty();
    }

    /** @return all recorded cookie header values of the calling thread, flattened into one list */
    public List<String> getCookies() {
        return cookieJar.get().values().stream().flatMap(List::stream).toList();
    }

    /**
     * Returns the calling thread's jar as-is; the uri and the request headers are not consulted.
     *
     * <p>NOTE(review): the returned map is keyed by the response header names that were stored
     * (e.g. {@code Set-Cookie}), not by {@code Cookie}; whether a given HTTP client forwards
     * these entries as request headers should be confirmed against that client.
     */
    @Override
    public Map<String, List<String>> get(URI uri, Map<String, List<String>> requestHeaders) throws IOException {
        return cookieJar.get();
    }

    /**
     * Records the cookie-setting headers of a response in the calling thread's jar.
     *
     * <p>Entries with a null name or null value list are skipped (the backing
     * {@link ConcurrentHashMap} rejects nulls, and some header maps carry the status line
     * under a null key), as are all headers other than {@code Set-Cookie} / {@code Set-Cookie2}.
     * A later value list for the same header name replaces the earlier one.
     *
     * @param uri             the request URI; not used
     * @param responseHeaders the response headers, may be null
     */
    @Override
    public void put(URI uri, Map<String, List<String>> responseHeaders) throws IOException {
        if (responseHeaders == null) {
            return;
        }

        final Map<String, List<String>> jar = cookieJar.get();

        for (Map.Entry<String, List<String>> header : responseHeaders.entrySet()) {
            final String name = header.getKey();
            final List<String> values = header.getValue();

            if (name == null || values == null) {
                continue;
            }

            if ("Set-Cookie".equalsIgnoreCase(name) || "Set-Cookie2".equalsIgnoreCase(name)) {
                jar.put(name, values);
            }
        }
    }
}
|
@@ -0,0 +1,56 @@
|
||||
package nu.marginalia.crawl.fetcher;
|
||||
|
||||
import org.apache.hc.client5.http.classic.methods.HttpUriRequestBase;
|
||||
import org.apache.hc.core5.http.ClassicHttpRequest;
|
||||
import org.apache.hc.core5.http.HttpResponse;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
public class DomainCookies {
|
||||
private final Map<String, String> cookies = new HashMap<>();
|
||||
|
||||
public boolean hasCookies() {
|
||||
return !cookies.isEmpty();
|
||||
}
|
||||
|
||||
public void updateCookieStore(HttpResponse response) {
|
||||
for (var header : response.getHeaders()) {
|
||||
if (header.getName().equalsIgnoreCase("Set-Cookie")) {
|
||||
parseCookieHeader(header.getValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void parseCookieHeader(String value) {
|
||||
// Parse the Set-Cookie header value and extract the cookies
|
||||
|
||||
String[] parts = value.split(";");
|
||||
String cookie = parts[0].trim();
|
||||
|
||||
if (cookie.contains("=")) {
|
||||
String[] cookieParts = cookie.split("=");
|
||||
String name = cookieParts[0].trim();
|
||||
String val = cookieParts[1].trim();
|
||||
cookies.put(name, val);
|
||||
}
|
||||
}
|
||||
|
||||
public void paintRequest(HttpUriRequestBase request) {
|
||||
request.addHeader("Cookie", createCookieHeader());
|
||||
}
|
||||
|
||||
public void paintRequest(ClassicHttpRequest request) {
|
||||
request.addHeader("Cookie", createCookieHeader());
|
||||
}
|
||||
|
||||
private String createCookieHeader() {
|
||||
StringJoiner sj = new StringJoiner("; ");
|
||||
for (var cookie : cookies.entrySet()) {
|
||||
sj.add(cookie.getKey() + "=" + cookie.getValue());
|
||||
}
|
||||
return sj.toString();
|
||||
}
|
||||
|
||||
}
|
@@ -8,6 +8,7 @@ import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.body.HttpFetchResult;
|
||||
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
|
||||
import org.apache.hc.client5.http.cookie.CookieStore;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@@ -15,20 +16,17 @@ import java.util.List;
|
||||
public interface HttpFetcher extends AutoCloseable {
|
||||
void setAllowAllContentTypes(boolean allowAllContentTypes);
|
||||
|
||||
Cookies getCookies();
|
||||
CookieStore getCookies();
|
||||
void clearCookies();
|
||||
|
||||
DomainProbeResult probeDomain(EdgeUrl url);
|
||||
|
||||
ContentTypeProbeResult probeContentType(
|
||||
EdgeUrl url,
|
||||
WarcRecorder recorder,
|
||||
ContentTags tags) throws HttpFetcherImpl.RateLimitException;
|
||||
|
||||
HttpFetchResult fetchContent(EdgeUrl url,
|
||||
WarcRecorder recorder,
|
||||
DomainCookies cookies,
|
||||
CrawlDelayTimer timer,
|
||||
ContentTags tags,
|
||||
ProbeType probeType) throws Exception;
|
||||
ProbeType probeType);
|
||||
|
||||
List<EdgeUrl> fetchSitemapUrls(String rootSitemapUrl, CrawlDelayTimer delayTimer);
|
||||
|
||||
@@ -46,6 +44,7 @@ public interface HttpFetcher extends AutoCloseable {
|
||||
|
||||
/** This domain redirects to another domain */
|
||||
record Redirect(EdgeDomain domain) implements DomainProbeResult {}
|
||||
record RedirectSameDomain_Internal(EdgeUrl domain) implements DomainProbeResult {}
|
||||
|
||||
/** If the retrieval of the probed url was successful, return the url as it was fetched
|
||||
* (which may be different from the url we probed, if we attempted another URL schema).
|
||||
@@ -56,7 +55,10 @@ public interface HttpFetcher extends AutoCloseable {
|
||||
}
|
||||
|
||||
sealed interface ContentTypeProbeResult {
|
||||
record NoOp() implements ContentTypeProbeResult {}
|
||||
record Ok(EdgeUrl resolvedUrl) implements ContentTypeProbeResult { }
|
||||
record HttpError(int statusCode, String message) implements ContentTypeProbeResult { }
|
||||
record Redirect(EdgeUrl location) implements ContentTypeProbeResult { }
|
||||
record BadContentType(String contentType, int statusCode) implements ContentTypeProbeResult { }
|
||||
record Timeout(java.lang.Exception ex) implements ContentTypeProbeResult { }
|
||||
record Exception(java.lang.Exception ex) implements ContentTypeProbeResult { }
|
||||
|
@@ -5,79 +5,168 @@ import com.google.inject.Singleton;
|
||||
import crawlercommons.robots.SimpleRobotRules;
|
||||
import crawlercommons.robots.SimpleRobotRulesParser;
|
||||
import nu.marginalia.UserAgent;
|
||||
import nu.marginalia.crawl.fetcher.socket.NoSecuritySSL;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
||||
import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.body.ContentTypeLogic;
|
||||
import nu.marginalia.model.body.DocumentBodyExtractor;
|
||||
import nu.marginalia.model.body.HttpFetchResult;
|
||||
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
|
||||
import org.apache.hc.client5.http.ConnectionKeepAliveStrategy;
|
||||
import org.apache.hc.client5.http.HttpRequestRetryStrategy;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||
import org.apache.hc.client5.http.config.ConnectionConfig;
|
||||
import org.apache.hc.client5.http.config.RequestConfig;
|
||||
import org.apache.hc.client5.http.cookie.BasicCookieStore;
|
||||
import org.apache.hc.client5.http.cookie.CookieStore;
|
||||
import org.apache.hc.client5.http.cookie.StandardCookieSpec;
|
||||
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
|
||||
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager;
|
||||
import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManagerBuilder;
|
||||
import org.apache.hc.client5.http.ssl.DefaultClientTlsStrategy;
|
||||
import org.apache.hc.core5.http.*;
|
||||
import org.apache.hc.core5.http.io.HttpClientResponseHandler;
|
||||
import org.apache.hc.core5.http.io.SocketConfig;
|
||||
import org.apache.hc.core5.http.io.entity.EntityUtils;
|
||||
import org.apache.hc.core5.http.io.support.ClassicRequestBuilder;
|
||||
import org.apache.hc.core5.http.message.MessageSupport;
|
||||
import org.apache.hc.core5.http.protocol.HttpContext;
|
||||
import org.apache.hc.core5.pool.PoolStats;
|
||||
import org.apache.hc.core5.util.TimeValue;
|
||||
import org.apache.hc.core5.util.Timeout;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.parser.Parser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.slf4j.Marker;
|
||||
import org.slf4j.MarkerFactory;
|
||||
|
||||
import javax.net.ssl.SSLContext;
|
||||
import javax.net.ssl.SSLException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.net.http.HttpTimeoutException;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.Semaphore;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
|
||||
@Singleton
|
||||
public class HttpFetcherImpl implements HttpFetcher {
|
||||
public class HttpFetcherImpl implements HttpFetcher, HttpRequestRetryStrategy {
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final String userAgentString;
|
||||
private final String userAgentIdentifier;
|
||||
private final Cookies cookies = new Cookies();
|
||||
|
||||
private final CookieStore cookies = new BasicCookieStore();
|
||||
|
||||
private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
|
||||
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
|
||||
private final Marker crawlerAuditMarker = MarkerFactory.getMarker("CRAWLER");
|
||||
|
||||
private final Duration requestTimeout = Duration.ofSeconds(10);
|
||||
private final Duration probeTimeout = Duration.ofSeconds(30);
|
||||
|
||||
private final LinkParser linkParser = new LinkParser();
|
||||
@Override
|
||||
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
|
||||
contentTypeLogic.setAllowAllContentTypes(allowAllContentTypes);
|
||||
}
|
||||
|
||||
private final HttpClient client;
|
||||
private final CloseableHttpClient client;
|
||||
private PoolingHttpClientConnectionManager connectionManager;
|
||||
|
||||
private HttpClient createClient() {
|
||||
final ExecutorService executorService;
|
||||
public PoolStats getPoolStats() {
|
||||
return connectionManager.getTotalStats();
|
||||
}
|
||||
|
||||
if (Boolean.getBoolean("crawler.httpclient.useVirtualThreads")) {
|
||||
executorService = Executors.newVirtualThreadPerTaskExecutor();
|
||||
}
|
||||
else {
|
||||
executorService = Executors.newCachedThreadPool();
|
||||
}
|
||||
private CloseableHttpClient createClient() throws NoSuchAlgorithmException {
|
||||
final ConnectionConfig connectionConfig = ConnectionConfig.custom()
|
||||
.setSocketTimeout(10, TimeUnit.SECONDS)
|
||||
.setConnectTimeout(30, TimeUnit.SECONDS)
|
||||
.setValidateAfterInactivity(TimeValue.ofSeconds(5))
|
||||
.build();
|
||||
|
||||
return HttpClient.newBuilder()
|
||||
.sslContext(NoSecuritySSL.buildSslContext())
|
||||
.cookieHandler(cookies)
|
||||
.followRedirects(HttpClient.Redirect.NORMAL)
|
||||
.version(HttpClient.Version.HTTP_1_1)
|
||||
.connectTimeout(Duration.ofSeconds(8))
|
||||
.executor(executorService)
|
||||
connectionManager = PoolingHttpClientConnectionManagerBuilder.create()
|
||||
.setMaxConnPerRoute(2)
|
||||
.setMaxConnTotal(5000)
|
||||
.setDefaultConnectionConfig(connectionConfig)
|
||||
.setTlsSocketStrategy(new DefaultClientTlsStrategy(SSLContext.getDefault()))
|
||||
.build();
|
||||
|
||||
connectionManager.setDefaultSocketConfig(SocketConfig.custom()
|
||||
.setSoLinger(TimeValue.ofSeconds(-1))
|
||||
.setSoTimeout(Timeout.ofSeconds(10))
|
||||
.build()
|
||||
);
|
||||
|
||||
Thread.ofPlatform().daemon(true).start(() -> {
|
||||
try {
|
||||
for (;;) {
|
||||
TimeUnit.SECONDS.sleep(15);
|
||||
logger.info("Connection pool stats: {}", connectionManager.getTotalStats());
|
||||
}
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
});
|
||||
|
||||
final RequestConfig defaultRequestConfig = RequestConfig.custom()
|
||||
.setCookieSpec(StandardCookieSpec.RELAXED)
|
||||
.setResponseTimeout(10, TimeUnit.SECONDS)
|
||||
.setConnectionRequestTimeout(5, TimeUnit.MINUTES)
|
||||
.build();
|
||||
|
||||
return HttpClients.custom()
|
||||
.setDefaultCookieStore(cookies)
|
||||
.setConnectionManager(connectionManager)
|
||||
.setRetryStrategy(this)
|
||||
.setKeepAliveStrategy(new ConnectionKeepAliveStrategy() {
|
||||
// Default keep-alive duration is 3 minutes, but this is too long for us,
|
||||
// as we are either going to re-use it fairly quickly or close it for a long time.
|
||||
//
|
||||
// So we set it to 30 seconds or clamp the server-provided value to a minimum of 10 seconds.
|
||||
private static final TimeValue defaultValue = TimeValue.ofSeconds(30);
|
||||
|
||||
@Override
|
||||
public TimeValue getKeepAliveDuration(HttpResponse response, HttpContext context) {
|
||||
final Iterator<HeaderElement> it = MessageSupport.iterate(response, HeaderElements.KEEP_ALIVE);
|
||||
|
||||
while (it.hasNext()) {
|
||||
final HeaderElement he = it.next();
|
||||
final String param = he.getName();
|
||||
final String value = he.getValue();
|
||||
|
||||
if (value == null)
|
||||
continue;
|
||||
if (!"timeout".equalsIgnoreCase(param))
|
||||
continue;
|
||||
|
||||
try {
|
||||
long timeout = Long.parseLong(value);
|
||||
timeout = Math.clamp(timeout, 30, defaultValue.toSeconds());
|
||||
return TimeValue.ofSeconds(timeout);
|
||||
} catch (final NumberFormatException ignore) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return defaultValue;
|
||||
}
|
||||
})
|
||||
.disableRedirectHandling()
|
||||
.setDefaultRequestConfig(defaultRequestConfig)
|
||||
.build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Cookies getCookies() {
|
||||
public CookieStore getCookies() {
|
||||
return cookies;
|
||||
}
|
||||
|
||||
@@ -89,19 +178,27 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
@Inject
|
||||
public HttpFetcherImpl(UserAgent userAgent)
|
||||
{
|
||||
this.client = createClient();
|
||||
try {
|
||||
this.client = createClient();
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
this.userAgentString = userAgent.uaString();
|
||||
this.userAgentIdentifier = userAgent.uaIdentifier();
|
||||
}
|
||||
|
||||
public HttpFetcherImpl(String userAgent) {
|
||||
this.client = createClient();
|
||||
try {
|
||||
this.client = createClient();
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
this.userAgentString = userAgent;
|
||||
this.userAgentIdentifier = userAgent;
|
||||
}
|
||||
|
||||
// Not necessary in prod, but useful in test
|
||||
public void close() {
|
||||
public void close() throws IOException {
|
||||
client.close();
|
||||
}
|
||||
|
||||
@@ -114,34 +211,94 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
*/
|
||||
@Override
|
||||
public DomainProbeResult probeDomain(EdgeUrl url) {
|
||||
HttpRequest head;
|
||||
try {
|
||||
head = HttpRequest.newBuilder()
|
||||
.HEAD()
|
||||
.uri(url.asURI())
|
||||
.header("User-agent", userAgentString)
|
||||
.timeout(probeTimeout)
|
||||
.build();
|
||||
} catch (URISyntaxException e) {
|
||||
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
|
||||
}
|
||||
List<EdgeUrl> urls = new ArrayList<>();
|
||||
urls.add(url);
|
||||
|
||||
for (int tries = 0;; tries++) {
|
||||
int redirects = 0;
|
||||
AtomicBoolean tryGet = new AtomicBoolean(false);
|
||||
|
||||
while (!urls.isEmpty() && ++redirects < 5) {
|
||||
ClassicHttpRequest request;
|
||||
|
||||
EdgeUrl topUrl = urls.removeFirst();
|
||||
try {
|
||||
var rsp = SendLock.wrapSend(client, head, HttpResponse.BodyHandlers.discarding());
|
||||
EdgeUrl rspUri = new EdgeUrl(rsp.uri());
|
||||
|
||||
if (!Objects.equals(rspUri.domain, url.domain)) {
|
||||
return new DomainProbeResult.Redirect(rspUri.domain);
|
||||
if (tryGet.get()) {
|
||||
request = ClassicRequestBuilder.get(topUrl.asURI())
|
||||
.addHeader("User-Agent", userAgentString)
|
||||
.addHeader("Accept-Encoding", "gzip")
|
||||
.addHeader("Range", "bytes=0-255")
|
||||
.build();
|
||||
} else {
|
||||
request = ClassicRequestBuilder.head(topUrl.asURI())
|
||||
.addHeader("User-Agent", userAgentString)
|
||||
.addHeader("Accept-Encoding", "gzip")
|
||||
.build();
|
||||
}
|
||||
return new DomainProbeResult.Ok(rspUri);
|
||||
} catch (Exception ex) {
|
||||
if (tries > 3) {
|
||||
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
|
||||
}
|
||||
// else try again ...
|
||||
} catch (URISyntaxException e) {
|
||||
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
|
||||
}
|
||||
|
||||
try {
|
||||
var result = SendLock.wrapSend(client, request, response -> {
|
||||
EntityUtils.consume(response.getEntity());
|
||||
|
||||
return switch (response.getCode()) {
|
||||
case 200 -> new DomainProbeResult.Ok(url);
|
||||
case 405 -> {
|
||||
if (!tryGet.get()) {
|
||||
tryGet.set(true);
|
||||
yield new DomainProbeResult.RedirectSameDomain_Internal(url);
|
||||
}
|
||||
else {
|
||||
yield new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "HTTP status 405, tried HEAD and GET?!");
|
||||
}
|
||||
}
|
||||
case 301, 302, 307 -> {
|
||||
var location = response.getFirstHeader("Location");
|
||||
|
||||
if (location != null) {
|
||||
Optional<EdgeUrl> newUrl = linkParser.parseLink(topUrl, location.getValue());
|
||||
if (newUrl.isEmpty()) {
|
||||
yield new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid location header on redirect");
|
||||
}
|
||||
EdgeUrl newEdgeUrl = newUrl.get();
|
||||
if (newEdgeUrl.domain.equals(topUrl.domain)) {
|
||||
yield new DomainProbeResult.RedirectSameDomain_Internal(newEdgeUrl);
|
||||
}
|
||||
else {
|
||||
yield new DomainProbeResult.Redirect(newEdgeUrl.domain);
|
||||
}
|
||||
}
|
||||
|
||||
yield new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "No location header on redirect");
|
||||
|
||||
}
|
||||
default ->
|
||||
new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "HTTP status " + response.getCode());
|
||||
};
|
||||
});
|
||||
|
||||
if (result instanceof DomainProbeResult.RedirectSameDomain_Internal(EdgeUrl redirUrl)) {
|
||||
urls.add(redirUrl);
|
||||
}
|
||||
else {
|
||||
return result;
|
||||
}
|
||||
|
||||
// We don't have robots.txt yet, so we'll assume a request delay of 1 second
|
||||
TimeUnit.SECONDS.sleep(1);
|
||||
}
|
||||
catch (SocketTimeoutException ex) {
|
||||
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Timeout during domain probe");
|
||||
}
|
||||
catch (Exception ex) {
|
||||
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Error during domain probe");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Failed to resolve domain root");
|
||||
|
||||
}
|
||||
|
||||
/** Perform a HEAD request to fetch the content type of a URL.
|
||||
@@ -152,70 +309,73 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
* recorded in the WARC file on failure.
|
||||
*/
|
||||
public ContentTypeProbeResult probeContentType(EdgeUrl url,
|
||||
WarcRecorder warcRecorder,
|
||||
ContentTags tags) throws RateLimitException {
|
||||
if (tags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) {
|
||||
|
||||
try {
|
||||
var headBuilder = HttpRequest.newBuilder()
|
||||
.HEAD()
|
||||
.uri(url.asURI())
|
||||
.header("User-Agent", userAgentString)
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.timeout(requestTimeout)
|
||||
;
|
||||
|
||||
var rsp = SendLock.wrapSend(client, headBuilder.build(), HttpResponse.BodyHandlers.discarding());
|
||||
var headers = rsp.headers();
|
||||
|
||||
var contentTypeHeader = headers.firstValue("Content-Type").orElse(null);
|
||||
|
||||
if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
|
||||
warcRecorder.flagAsFailedContentTypeProbe(url, contentTypeHeader, rsp.statusCode());
|
||||
|
||||
return new ContentTypeProbeResult.BadContentType(contentTypeHeader, rsp.statusCode());
|
||||
}
|
||||
|
||||
// Update the URL to the final URL of the HEAD request, otherwise we might end up doing
|
||||
|
||||
// HEAD 301 url1 -> url2
|
||||
// HEAD 200 url2
|
||||
// GET 301 url1 -> url2
|
||||
// GET 200 url2
|
||||
|
||||
// which is not what we want. Overall we want to do as few requests as possible to not raise
|
||||
// too many eyebrows when looking at the logs on the target server. Overall it's probably desirable
|
||||
// that it looks like the traffic makes sense, as opposed to looking like a broken bot.
|
||||
|
||||
var redirectUrl = new EdgeUrl(rsp.uri());
|
||||
EdgeUrl ret;
|
||||
|
||||
if (Objects.equals(redirectUrl.domain, url.domain)) ret = redirectUrl;
|
||||
else ret = url;
|
||||
|
||||
// Intercept rate limiting
|
||||
if (rsp.statusCode() == 429) {
|
||||
throw new HttpFetcherImpl.RateLimitException(headers.firstValue("Retry-After").orElse("1"));
|
||||
}
|
||||
|
||||
return new ContentTypeProbeResult.Ok(ret);
|
||||
}
|
||||
catch (HttpTimeoutException ex) {
|
||||
warcRecorder.flagAsTimeout(url);
|
||||
return new ContentTypeProbeResult.Timeout(ex);
|
||||
}
|
||||
catch (RateLimitException ex) {
|
||||
throw ex;
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
|
||||
|
||||
warcRecorder.flagAsError(url, ex);
|
||||
|
||||
return new ContentTypeProbeResult.Exception(ex);
|
||||
}
|
||||
DomainCookies cookies,
|
||||
CrawlDelayTimer timer,
|
||||
ContentTags tags) {
|
||||
if (!tags.isEmpty() || !contentTypeLogic.isUrlLikeBinary(url)) {
|
||||
return new ContentTypeProbeResult.NoOp();
|
||||
}
|
||||
|
||||
try {
|
||||
ClassicHttpRequest head = ClassicRequestBuilder.head(url.asURI())
|
||||
.addHeader("User-Agent", userAgentString)
|
||||
.addHeader("Accept-Encoding", "gzip")
|
||||
.build();
|
||||
|
||||
cookies.paintRequest(head);
|
||||
|
||||
return SendLock.wrapSend(client, head, (rsp) -> {
|
||||
cookies.updateCookieStore(rsp);
|
||||
EntityUtils.consume(rsp.getEntity());
|
||||
int statusCode = rsp.getCode();
|
||||
|
||||
// Handle redirects
|
||||
if (statusCode == 301 || statusCode == 302 || statusCode == 307) {
|
||||
var location = rsp.getFirstHeader("Location");
|
||||
if (location != null) {
|
||||
Optional<EdgeUrl> newUrl = linkParser.parseLink(url, location.getValue());
|
||||
if (newUrl.isEmpty())
|
||||
return new ContentTypeProbeResult.HttpError(statusCode, "Invalid location header on redirect");
|
||||
return new ContentTypeProbeResult.Redirect(newUrl.get());
|
||||
}
|
||||
}
|
||||
|
||||
if (statusCode == 405) {
|
||||
// If we get a 405, we can't probe the content type with HEAD, so we'll just say it's ok
|
||||
return new ContentTypeProbeResult.Ok(url);
|
||||
}
|
||||
|
||||
// Handle errors
|
||||
if (statusCode < 200 || statusCode > 300) {
|
||||
return new ContentTypeProbeResult.HttpError(statusCode, "Bad status code");
|
||||
}
|
||||
|
||||
// Handle missing content type
|
||||
var ctHeader = rsp.getFirstHeader("Content-Type");
|
||||
if (ctHeader == null) {
|
||||
return new ContentTypeProbeResult.HttpError(statusCode, "Missing Content-Type header");
|
||||
}
|
||||
var contentType = ctHeader.getValue();
|
||||
|
||||
// Check if the content type is allowed
|
||||
if (contentTypeLogic.isAllowableContentType(contentType)) {
|
||||
return new ContentTypeProbeResult.Ok(url);
|
||||
} else {
|
||||
return new ContentTypeProbeResult.BadContentType(contentType, statusCode);
|
||||
}
|
||||
});
|
||||
}
|
||||
catch (SocketTimeoutException ex) {
|
||||
|
||||
return new ContentTypeProbeResult.Timeout(ex);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
|
||||
return new ContentTypeProbeResult.Exception(ex);
|
||||
}
|
||||
finally {
|
||||
timer.waitFetchDelay();
|
||||
}
|
||||
return new ContentTypeProbeResult.Ok(url);
|
||||
}
|
||||
|
||||
/** Fetch the content of a URL, and record it in a WARC file,
|
||||
@@ -225,38 +385,85 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
@Override
|
||||
public HttpFetchResult fetchContent(EdgeUrl url,
|
||||
WarcRecorder warcRecorder,
|
||||
DomainCookies cookies,
|
||||
CrawlDelayTimer timer,
|
||||
ContentTags contentTags,
|
||||
ProbeType probeType)
|
||||
throws Exception
|
||||
{
|
||||
var getBuilder = HttpRequest.newBuilder()
|
||||
.GET()
|
||||
.uri(url.asURI())
|
||||
.header("User-Agent", userAgentString)
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.header("Accept-Language", "en,*;q=0.5")
|
||||
.header("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
|
||||
.timeout(requestTimeout)
|
||||
;
|
||||
try {
|
||||
if (probeType == HttpFetcher.ProbeType.FULL) {
|
||||
try {
|
||||
var probeResult = probeContentType(url, cookies, timer, contentTags);
|
||||
|
||||
contentTags.paint(getBuilder);
|
||||
switch (probeResult) {
|
||||
case HttpFetcher.ContentTypeProbeResult.NoOp():
|
||||
break; //
|
||||
case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl):
|
||||
logger.info(crawlerAuditMarker, "Probe result OK for {}", url);
|
||||
url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching
|
||||
break;
|
||||
case ContentTypeProbeResult.BadContentType badContentType:
|
||||
warcRecorder.flagAsFailedContentTypeProbe(url, badContentType.contentType(), badContentType.statusCode());
|
||||
logger.info(crawlerAuditMarker, "Probe result Bad ContenType ({}) for {}", badContentType.contentType(), url);
|
||||
return new HttpFetchResult.ResultNone();
|
||||
case ContentTypeProbeResult.BadContentType.Timeout(Exception ex):
|
||||
logger.info(crawlerAuditMarker, "Probe result Timeout for {}", url);
|
||||
warcRecorder.flagAsTimeout(url);
|
||||
return new HttpFetchResult.ResultException(ex);
|
||||
case ContentTypeProbeResult.Exception(Exception ex):
|
||||
logger.info(crawlerAuditMarker, "Probe result Exception({}) for {}", ex.getClass().getSimpleName(), url);
|
||||
warcRecorder.flagAsError(url, ex);
|
||||
return new HttpFetchResult.ResultException(ex);
|
||||
case ContentTypeProbeResult.HttpError httpError:
|
||||
logger.info(crawlerAuditMarker, "Probe result HTTP Error ({}) for {}", httpError.statusCode(), url);
|
||||
return new HttpFetchResult.ResultException(new HttpException("HTTP status code " + httpError.statusCode() + ": " + httpError.message()));
|
||||
case ContentTypeProbeResult.Redirect redirect:
|
||||
logger.info(crawlerAuditMarker, "Probe result redirect for {} -> {}", url, redirect.location());
|
||||
return new HttpFetchResult.ResultRedirect(redirect.location());
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
logger.warn("Failed to fetch {}", url, ex);
|
||||
return new HttpFetchResult.ResultException(ex);
|
||||
}
|
||||
|
||||
try (var sl = new SendLock()) {
|
||||
HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());
|
||||
|
||||
if (result instanceof HttpFetchResult.ResultOk ok) {
|
||||
if (ok.statusCode() == 429) {
|
||||
throw new RateLimitException(Objects.requireNonNullElse(ok.header("Retry-After"), "1"));
|
||||
}
|
||||
if (ok.statusCode() == 304) {
|
||||
return new HttpFetchResult.Result304Raw();
|
||||
}
|
||||
if (ok.statusCode() == 200) {
|
||||
return ok;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
HttpGet request = new HttpGet(url.asURI());
|
||||
request.addHeader("User-Agent", userAgentString);
|
||||
request.addHeader("Accept-Encoding", "gzip");
|
||||
request.addHeader("Accept-Language", "en,*;q=0.5");
|
||||
request.addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8");
|
||||
|
||||
contentTags.paint(request);
|
||||
|
||||
try (var sl = new SendLock()) {
|
||||
Instant start = Instant.now();
|
||||
HttpFetchResult result = warcRecorder.fetch(client, cookies, request);
|
||||
|
||||
Duration fetchDuration = Duration.between(start, Instant.now());
|
||||
|
||||
if (result instanceof HttpFetchResult.ResultOk ok) {
|
||||
if (ok.statusCode() == 304) {
|
||||
result = new HttpFetchResult.Result304Raw();
|
||||
}
|
||||
}
|
||||
|
||||
switch (result) {
|
||||
case HttpFetchResult.ResultOk ok -> logger.info(crawlerAuditMarker, "Fetch result OK {} for {} ({} ms)", ok.statusCode(), url, fetchDuration.toMillis());
|
||||
case HttpFetchResult.ResultRedirect redirect -> logger.info(crawlerAuditMarker, "Fetch result redirect: {} for {}", redirect.url(), url);
|
||||
case HttpFetchResult.ResultNone none -> logger.info(crawlerAuditMarker, "Fetch result none for {}", url);
|
||||
case HttpFetchResult.ResultException ex -> logger.error(crawlerAuditMarker, "Fetch result exception for {}", url, ex.ex());
|
||||
case HttpFetchResult.Result304Raw raw -> logger.info(crawlerAuditMarker, "Fetch result: 304 Raw for {}", url);
|
||||
case HttpFetchResult.Result304ReplacedWithReference ref -> logger.info(crawlerAuditMarker, "Fetch result: 304 With reference for {}", url);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error(crawlerAuditMarker, "Fetch result exception for {}", url, ex);
|
||||
|
||||
return new HttpFetchResult.ResultException(ex);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -322,68 +529,66 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
}
|
||||
|
||||
|
||||
private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
|
||||
HttpRequest getRequest = HttpRequest.newBuilder()
|
||||
.GET()
|
||||
.uri(sitemapUrl.asURI())
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.header("Accept", "text/*, */*;q=0.9")
|
||||
.header("User-Agent", userAgentString)
|
||||
.timeout(requestTimeout)
|
||||
.build();
|
||||
private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException {
|
||||
HttpGet getRequest = new HttpGet(sitemapUrl.asURI());
|
||||
|
||||
getRequest.addHeader("User-Agent", userAgentString);
|
||||
getRequest.addHeader("Accept-Encoding", "gzip");
|
||||
getRequest.addHeader("Accept", "text/*, */*;q=0.9");
|
||||
getRequest.addHeader("User-Agent", userAgentString);
|
||||
|
||||
try (var sl = new SendLock()) {
|
||||
var response = client.send(getRequest, HttpResponse.BodyHandlers.ofInputStream());
|
||||
if (response.statusCode() != 200) {
|
||||
return new SitemapResult.SitemapError();
|
||||
}
|
||||
|
||||
Document parsedSitemap;
|
||||
|
||||
try (InputStream inputStream = response.body()) {
|
||||
InputStream parserStream;
|
||||
if (sitemapUrl.path.endsWith(".gz")) {
|
||||
parserStream = new GZIPInputStream(inputStream);
|
||||
} else {
|
||||
parserStream = inputStream;
|
||||
}
|
||||
|
||||
parsedSitemap = Jsoup.parse(parserStream, "UTF-8", sitemapUrl.toString(), Parser.xmlParser());
|
||||
}
|
||||
finally {
|
||||
sl.close();
|
||||
}
|
||||
|
||||
if (parsedSitemap.childrenSize() == 0) {
|
||||
return new SitemapResult.SitemapError();
|
||||
}
|
||||
|
||||
String rootTagName = parsedSitemap.child(0).tagName();
|
||||
|
||||
return switch (rootTagName.toLowerCase()) {
|
||||
case "sitemapindex" -> {
|
||||
List<String> references = new ArrayList<>();
|
||||
for (var locTag : parsedSitemap.getElementsByTag("loc")) {
|
||||
references.add(locTag.text().trim());
|
||||
return client.execute(getRequest, response -> {
|
||||
try {
|
||||
if (response.getCode() != 200) {
|
||||
return new SitemapResult.SitemapError();
|
||||
}
|
||||
yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
|
||||
}
|
||||
case "urlset" -> {
|
||||
List<String> urls = new ArrayList<>();
|
||||
for (var locTag : parsedSitemap.select("url > loc")) {
|
||||
urls.add(locTag.text().trim());
|
||||
|
||||
Document parsedSitemap = Jsoup.parse(
|
||||
EntityUtils.toString(response.getEntity()),
|
||||
sitemapUrl.toString(),
|
||||
Parser.xmlParser()
|
||||
);
|
||||
|
||||
if (parsedSitemap.childrenSize() == 0) {
|
||||
return new SitemapResult.SitemapError();
|
||||
}
|
||||
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
|
||||
|
||||
String rootTagName = parsedSitemap.child(0).tagName();
|
||||
|
||||
return switch (rootTagName.toLowerCase()) {
|
||||
case "sitemapindex" -> {
|
||||
List<String> references = new ArrayList<>();
|
||||
for (var locTag : parsedSitemap.getElementsByTag("loc")) {
|
||||
references.add(locTag.text().trim());
|
||||
}
|
||||
yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
|
||||
}
|
||||
case "urlset" -> {
|
||||
List<String> urls = new ArrayList<>();
|
||||
for (var locTag : parsedSitemap.select("url > loc")) {
|
||||
urls.add(locTag.text().trim());
|
||||
}
|
||||
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
|
||||
}
|
||||
case "rss", "atom" -> {
|
||||
List<String> urls = new ArrayList<>();
|
||||
for (var locTag : parsedSitemap.select("link, url")) {
|
||||
urls.add(locTag.text().trim());
|
||||
}
|
||||
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
|
||||
}
|
||||
default -> new SitemapResult.SitemapError();
|
||||
};
|
||||
}
|
||||
case "rss", "atom" -> {
|
||||
List<String> urls = new ArrayList<>();
|
||||
for (var locTag : parsedSitemap.select("link, url")) {
|
||||
urls.add(locTag.text().trim());
|
||||
}
|
||||
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
|
||||
finally {
|
||||
EntityUtils.consume(response.getEntity());
|
||||
}
|
||||
default -> new SitemapResult.SitemapError();
|
||||
};
|
||||
});
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.warn("Error while fetching sitemap {}: {} ({})", sitemapUrl, ex.getClass().getSimpleName(), ex.getMessage());
|
||||
return new SitemapResult.SitemapError();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -408,15 +613,13 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
|
||||
private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
|
||||
try (var sl = new SendLock()) {
|
||||
var getRequest = HttpRequest.newBuilder()
|
||||
.GET()
|
||||
.uri(url.asURI())
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.header("Accept", "text/*, */*;q=0.9")
|
||||
.header("User-Agent", userAgentString)
|
||||
.timeout(requestTimeout);
|
||||
|
||||
HttpFetchResult result = recorder.fetch(client, getRequest.build());
|
||||
HttpGet request = new HttpGet(url.asURI());
|
||||
request.addHeader("User-Agent", userAgentString);
|
||||
request.addHeader("Accept-Encoding", "gzip");
|
||||
request.addHeader("Accept", "text/*, */*;q=0.9");
|
||||
|
||||
HttpFetchResult result = recorder.fetch(client, new DomainCookies(), request);
|
||||
|
||||
return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) ->
|
||||
robotsParser.parseContent(url.toString(),
|
||||
@@ -430,6 +633,59 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean retryRequest(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
|
||||
if (exception instanceof SocketTimeoutException) { // Timeouts are not recoverable
|
||||
return false;
|
||||
}
|
||||
if (exception instanceof SSLException) { // SSL exceptions are unlikely to be recoverable
|
||||
return false;
|
||||
}
|
||||
|
||||
return executionCount <= 3;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean retryRequest(HttpResponse response, int executionCount, HttpContext context) {
|
||||
return switch (response.getCode()) {
|
||||
case 500, 503 -> executionCount <= 2;
|
||||
case 429 -> executionCount <= 3;
|
||||
default -> false;
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public TimeValue getRetryInterval(HttpRequest request, IOException exception, int executionCount, HttpContext context) {
|
||||
return TimeValue.ofSeconds(1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TimeValue getRetryInterval(HttpResponse response, int executionCount, HttpContext context) {
|
||||
|
||||
int statusCode = response.getCode();
|
||||
|
||||
// Give 503 a bit more time
|
||||
if (statusCode == 503) return TimeValue.ofSeconds(5);
|
||||
|
||||
if (statusCode == 429) {
|
||||
// get the Retry-After header
|
||||
String retryAfter = response.getFirstHeader("Retry-After").getValue();
|
||||
if (retryAfter == null) {
|
||||
return TimeValue.ofSeconds(2);
|
||||
}
|
||||
|
||||
try {
|
||||
int retryAfterTime = Integer.parseInt(retryAfter);
|
||||
retryAfterTime = Math.clamp(retryAfterTime, 1, 5);
|
||||
|
||||
return TimeValue.ofSeconds(retryAfterTime);
|
||||
} catch (NumberFormatException e) {
|
||||
logger.warn("Invalid Retry-After header: {}", retryAfter);
|
||||
}
|
||||
}
|
||||
|
||||
return TimeValue.ofSeconds(2);
|
||||
}
|
||||
|
||||
public static class RateLimitException extends Exception {
|
||||
private final String retryAfter;
|
||||
@@ -462,9 +718,10 @@ class SendLock implements AutoCloseable {
|
||||
maxConcurrentRequests.acquireUninterruptibly();
|
||||
}
|
||||
|
||||
public static <T> HttpResponse<T> wrapSend(HttpClient client, HttpRequest request, HttpResponse.BodyHandler<T> handler) throws IOException, InterruptedException {
|
||||
public static <T> T wrapSend(HttpClient client, final ClassicHttpRequest request,
|
||||
final HttpClientResponseHandler<? extends T> responseHandler) throws IOException {
|
||||
try (var lock = new SendLock()) {
|
||||
return client.send(request, handler);
|
||||
return client.execute(request, responseHandler);
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -1,18 +1,20 @@
|
||||
package nu.marginalia.crawl.fetcher.warc;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.io.input.BOMInputStream;
|
||||
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||
import org.apache.hc.core5.http.ClassicHttpResponse;
|
||||
import org.apache.hc.core5.http.Header;
|
||||
import org.netpreserve.jwarc.WarcTruncationReason;
|
||||
|
||||
import java.io.*;
|
||||
import java.net.http.HttpHeaders;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.*;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
import java.util.Arrays;
|
||||
|
||||
import static nu.marginalia.crawl.fetcher.warc.ErrorBuffer.suppressContentEncoding;
|
||||
|
||||
/** Input buffer for temporary storage of a HTTP response
|
||||
* This may be in-memory or on-disk, at the discretion of
|
||||
@@ -20,9 +22,9 @@ import java.util.zip.GZIPInputStream;
|
||||
* */
|
||||
public abstract class WarcInputBuffer implements AutoCloseable {
|
||||
protected WarcTruncationReason truncationReason = WarcTruncationReason.NOT_TRUNCATED;
|
||||
protected HttpHeaders headers;
|
||||
protected Header[] headers;
|
||||
|
||||
WarcInputBuffer(HttpHeaders headers) {
|
||||
WarcInputBuffer(Header[] headers) {
|
||||
this.headers = headers;
|
||||
}
|
||||
|
||||
@@ -34,7 +36,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
||||
|
||||
public final WarcTruncationReason truncationReason() { return truncationReason; }
|
||||
|
||||
public final HttpHeaders headers() { return headers; }
|
||||
public final Header[] headers() { return headers; }
|
||||
|
||||
/** Create a buffer for a response.
|
||||
* If the response is small and not compressed, it will be stored in memory.
|
||||
@@ -42,39 +44,50 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
||||
* and suppressed from the headers.
|
||||
* If an error occurs, a buffer will be created with no content and an error status.
|
||||
*/
|
||||
static WarcInputBuffer forResponse(HttpResponse<InputStream> rsp, Duration timeLimit) {
|
||||
if (rsp == null)
|
||||
static WarcInputBuffer forResponse(ClassicHttpResponse response,
|
||||
HttpGet request,
|
||||
Duration timeLimit) throws IOException {
|
||||
if (response == null)
|
||||
return new ErrorBuffer();
|
||||
|
||||
var headers = rsp.headers();
|
||||
|
||||
try (var is = rsp.body()) {
|
||||
int contentLength = (int) headers.firstValueAsLong("Content-Length").orElse(-1L);
|
||||
String contentEncoding = headers.firstValue("Content-Encoding").orElse(null);
|
||||
var entity = response.getEntity();
|
||||
|
||||
if (contentEncoding == null && contentLength > 0 && contentLength < 8192) {
|
||||
if (null == entity) {
|
||||
return new ErrorBuffer();
|
||||
}
|
||||
|
||||
InputStream is = null;
|
||||
try {
|
||||
is = entity.getContent();
|
||||
long length = entity.getContentLength();
|
||||
|
||||
if (length > 0 && length < 8192) {
|
||||
// If the content is small and not compressed, we can just read it into memory
|
||||
return new MemoryBuffer(headers, timeLimit, is, contentLength);
|
||||
}
|
||||
else {
|
||||
return new MemoryBuffer(response.getHeaders(), request, timeLimit, is, (int) length);
|
||||
} else {
|
||||
// Otherwise, we unpack it into a file and read it from there
|
||||
return new FileBuffer(headers, timeLimit, is);
|
||||
return new FileBuffer(response.getHeaders(), request, timeLimit, is);
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
return new ErrorBuffer();
|
||||
finally {
|
||||
try {
|
||||
is.skip(Long.MAX_VALUE);
|
||||
}
|
||||
catch (IOException e) {
|
||||
// Ignore the exception
|
||||
}
|
||||
finally {
|
||||
// Close the input stream
|
||||
IOUtils.closeQuietly(is);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static final ExecutorService virtualExecutorService = Executors.newVirtualThreadPerTaskExecutor();
|
||||
|
||||
private Future<Integer> readAsync(InputStream is, byte[] out) {
|
||||
return virtualExecutorService.submit(() -> is.read(out));
|
||||
}
|
||||
|
||||
/** Copy an input stream to an output stream, with a maximum size and time limit */
|
||||
protected void copy(InputStream is, OutputStream os, Duration timeLimit) {
|
||||
protected void copy(InputStream is, HttpGet request, OutputStream os, Duration timeLimit) {
|
||||
Instant start = Instant.now();
|
||||
Instant timeout = start.plus(timeLimit);
|
||||
long size = 0;
|
||||
@@ -89,28 +102,102 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
||||
Duration remaining = Duration.between(Instant.now(), timeout);
|
||||
if (remaining.isNegative()) {
|
||||
truncationReason = WarcTruncationReason.TIME;
|
||||
// Abort the request if the time limit is exceeded
|
||||
// so we don't keep the connection open forever or are forced to consume
|
||||
// the stream to the end
|
||||
|
||||
request.abort();
|
||||
break;
|
||||
}
|
||||
|
||||
Future<Integer> readAsync = readAsync(is, buffer);
|
||||
int n = readAsync.get(remaining.toMillis(), TimeUnit.MILLISECONDS);
|
||||
int n = is.read(buffer);
|
||||
|
||||
if (n < 0) break;
|
||||
size += n;
|
||||
os.write(buffer, 0, n);
|
||||
|
||||
if (size > WarcRecorder.MAX_SIZE) {
|
||||
// Even if we've exceeded the max length,
|
||||
// we keep consuming the stream up until the end or a timeout,
|
||||
// as closing the stream means resetting the connection, and
|
||||
// that's generally not desirable.
|
||||
|
||||
if (size < WarcRecorder.MAX_SIZE) {
|
||||
os.write(buffer, 0, n);
|
||||
}
|
||||
else if (truncationReason != WarcTruncationReason.LENGTH) {
|
||||
truncationReason = WarcTruncationReason.LENGTH;
|
||||
break;
|
||||
}
|
||||
} catch (IOException|ExecutionException e) {
|
||||
|
||||
} catch (IOException e) {
|
||||
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
||||
} catch (TimeoutException e) {
|
||||
truncationReason = WarcTruncationReason.TIME;
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/** Takes a Content-Range header and checks if it is complete.
|
||||
* A complete range is one that covers the entire resource.
|
||||
* For example, "bytes 0-1023/2048" or "bytes 0-1023/*" are complete ranges.
|
||||
* "bytes 0-1023/2048" is not a complete range.
|
||||
*/
|
||||
public boolean isRangeComplete(Header[] headers) {
|
||||
// Find the Content-Range header
|
||||
String contentRangeHeader = null;
|
||||
for (var header : headers) {
|
||||
if ("Content-Range".equalsIgnoreCase(header.getName())) {
|
||||
contentRangeHeader = header.getValue();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Return true if header is null or empty
|
||||
if (contentRangeHeader == null || contentRangeHeader.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
try {
|
||||
// Content-Range format: "bytes range-start-range-end/size"
|
||||
// e.g., "bytes 0-1023/2048" or "bytes 0-1023/*"
|
||||
|
||||
// Get the part after "bytes "
|
||||
String[] parts = contentRangeHeader.split(" ", 2);
|
||||
if (parts.length < 2) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get the range and size parts (e.g., "0-1023/2048")
|
||||
String rangeAndSize = parts[1];
|
||||
String[] rangeAndSizeParts = rangeAndSize.split("/", 2);
|
||||
if (rangeAndSizeParts.length < 2) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get the range (e.g., "0-1023")
|
||||
String range = rangeAndSizeParts[0];
|
||||
String[] rangeParts = range.split("-", 2);
|
||||
if (rangeParts.length < 2) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get the size (e.g., "2048" or "*")
|
||||
String size = rangeAndSizeParts[1];
|
||||
|
||||
// If size is "*", we don't know the total size, so return false
|
||||
if ("*".equals(size)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Parse as long to handle large files
|
||||
long rangeStart = Long.parseLong(rangeParts[0]);
|
||||
long rangeEnd = Long.parseLong(rangeParts[1]);
|
||||
long totalSize = Long.parseLong(size);
|
||||
|
||||
// Check if the range covers the entire resource
|
||||
return rangeStart == 0 && rangeEnd == totalSize - 1;
|
||||
|
||||
} catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -118,7 +205,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
|
||||
/** Pseudo-buffer for when we have an error */
|
||||
class ErrorBuffer extends WarcInputBuffer {
|
||||
public ErrorBuffer() {
|
||||
super(HttpHeaders.of(Map.of(), (k,v)->false));
|
||||
super(new Header[0]);
|
||||
|
||||
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
||||
}
|
||||
@@ -135,17 +222,29 @@ class ErrorBuffer extends WarcInputBuffer {
|
||||
|
||||
@Override
|
||||
public void close() throws Exception {}
|
||||
|
||||
|
||||
static Header[] suppressContentEncoding(Header[] headers) {
|
||||
return Arrays.stream(headers).filter(header -> !"Content-Encoding".equalsIgnoreCase(header.getName())).toArray(Header[]::new);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/** Buffer for when we have the response in memory */
|
||||
class MemoryBuffer extends WarcInputBuffer {
|
||||
byte[] data;
|
||||
public MemoryBuffer(HttpHeaders headers, Duration timeLimit, InputStream responseStream, int size) {
|
||||
super(headers);
|
||||
public MemoryBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream, int size) {
|
||||
super(suppressContentEncoding(headers));
|
||||
|
||||
if (!isRangeComplete(headers)) {
|
||||
truncationReason = WarcTruncationReason.LENGTH;
|
||||
} else {
|
||||
truncationReason = WarcTruncationReason.NOT_TRUNCATED;
|
||||
}
|
||||
|
||||
var outputStream = new ByteArrayOutputStream(size);
|
||||
|
||||
copy(responseStream, outputStream, timeLimit);
|
||||
copy(responseStream, request, outputStream, timeLimit);
|
||||
|
||||
data = outputStream.toByteArray();
|
||||
}
|
||||
@@ -169,40 +268,25 @@ class MemoryBuffer extends WarcInputBuffer {
|
||||
class FileBuffer extends WarcInputBuffer {
|
||||
private final Path tempFile;
|
||||
|
||||
public FileBuffer(HttpHeaders headers, Duration timeLimit, InputStream responseStream) throws IOException {
|
||||
public FileBuffer(Header[] headers, HttpGet request, Duration timeLimit, InputStream responseStream) throws IOException {
|
||||
super(suppressContentEncoding(headers));
|
||||
|
||||
if (!isRangeComplete(headers)) {
|
||||
truncationReason = WarcTruncationReason.LENGTH;
|
||||
} else {
|
||||
truncationReason = WarcTruncationReason.NOT_TRUNCATED;
|
||||
}
|
||||
|
||||
this.tempFile = Files.createTempFile("rsp", ".html");
|
||||
|
||||
|
||||
if ("gzip".equalsIgnoreCase(headers.firstValue("Content-Encoding").orElse(""))) {
|
||||
try (var out = Files.newOutputStream(tempFile)) {
|
||||
copy(new GZIPInputStream(responseStream), out, timeLimit);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
||||
}
|
||||
try (var out = Files.newOutputStream(tempFile)) {
|
||||
copy(responseStream, request, out, timeLimit);
|
||||
}
|
||||
else {
|
||||
try (var out = Files.newOutputStream(tempFile)) {
|
||||
copy(responseStream, out, timeLimit);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
||||
}
|
||||
catch (Exception ex) {
|
||||
truncationReason = WarcTruncationReason.UNSPECIFIED;
|
||||
}
|
||||
}
|
||||
|
||||
private static HttpHeaders suppressContentEncoding(HttpHeaders headers) {
|
||||
return HttpHeaders.of(headers.map(), (k, v) -> {
|
||||
if ("Content-Encoding".equalsIgnoreCase(k)) {
|
||||
return false;
|
||||
}
|
||||
return !"Transfer-Encoding".equalsIgnoreCase(k);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
public InputStream read() throws IOException {
|
||||
return Files.newInputStream(tempFile);
|
||||
}
|
||||
|
@@ -1,6 +1,8 @@
|
||||
package nu.marginalia.crawl.fetcher.warc;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hc.core5.http.ClassicHttpResponse;
|
||||
import org.apache.hc.core5.http.Header;
|
||||
|
||||
import java.net.URI;
|
||||
import java.net.URLEncoder;
|
||||
@@ -17,7 +19,7 @@ import java.util.stream.Collectors;
|
||||
public class WarcProtocolReconstructor {
|
||||
|
||||
static String getHttpRequestString(String method,
|
||||
Map<String, List<String>> mainHeaders,
|
||||
Header[] mainHeaders,
|
||||
Map<String, List<String>> extraHeaders,
|
||||
URI uri) {
|
||||
StringBuilder requestStringBuilder = new StringBuilder();
|
||||
@@ -34,12 +36,13 @@ public class WarcProtocolReconstructor {
|
||||
|
||||
Set<String> addedHeaders = new HashSet<>();
|
||||
|
||||
mainHeaders.forEach((k, values) -> {
|
||||
for (var value : values) {
|
||||
addedHeaders.add(k);
|
||||
requestStringBuilder.append(capitalizeHeader(k)).append(": ").append(value).append("\r\n");
|
||||
}
|
||||
});
|
||||
for (var header : mainHeaders) {
|
||||
String k = header.getName();
|
||||
String v = header.getValue();
|
||||
|
||||
addedHeaders.add(k);
|
||||
requestStringBuilder.append(capitalizeHeader(k)).append(": ").append(v).append("\r\n");
|
||||
}
|
||||
|
||||
extraHeaders.forEach((k, values) -> {
|
||||
if (!addedHeaders.contains(k)) {
|
||||
@@ -87,6 +90,12 @@ public class WarcProtocolReconstructor {
|
||||
return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
|
||||
}
|
||||
|
||||
static String getResponseHeader(ClassicHttpResponse response, long size) {
|
||||
String headerString = getHeadersAsString(response.getHeaders(), size);
|
||||
|
||||
return response.getVersion().format() + " " + response.getCode() + " " + response.getReasonPhrase() + "\r\n" + headerString + "\r\n\r\n";
|
||||
}
|
||||
|
||||
private static final Map<Integer, String> STATUS_CODE_MAP = Map.ofEntries(
|
||||
Map.entry(200, "OK"),
|
||||
Map.entry(201, "Created"),
|
||||
@@ -149,6 +158,37 @@ public class WarcProtocolReconstructor {
|
||||
return joiner.toString();
|
||||
}
|
||||
|
||||
|
||||
|
||||
static private String getHeadersAsString(Header[] headers, long responseSize) {
|
||||
StringJoiner joiner = new StringJoiner("\r\n");
|
||||
|
||||
for (var header : headers) {
|
||||
String headerCapitalized = capitalizeHeader(header.getName());
|
||||
|
||||
// Omit pseudoheaders injected by the crawler itself
|
||||
if (headerCapitalized.startsWith("X-Marginalia"))
|
||||
continue;
|
||||
|
||||
// Omit Transfer-Encoding and Content-Encoding headers
|
||||
if (headerCapitalized.equals("Transfer-Encoding"))
|
||||
continue;
|
||||
if (headerCapitalized.equals("Content-Encoding"))
|
||||
continue;
|
||||
|
||||
// Since we're transparently decoding gzip, we need to update the Content-Length header
|
||||
// to reflect the actual size of the response body. We'll do this at the end.
|
||||
if (headerCapitalized.equals("Content-Length"))
|
||||
continue;
|
||||
|
||||
joiner.add(headerCapitalized + ": " + header.getValue());
|
||||
}
|
||||
|
||||
joiner.add("Content-Length: " + responseSize);
|
||||
|
||||
return joiner.toString();
|
||||
}
|
||||
|
||||
static private String getHeadersAsString(HttpHeaders headers, long responseSize) {
|
||||
StringJoiner joiner = new StringJoiner("\r\n");
|
||||
|
||||
|
@@ -1,11 +1,16 @@
|
||||
package nu.marginalia.crawl.fetcher.warc;
|
||||
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.fetcher.Cookies;
|
||||
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.body.HttpFetchResult;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||
import org.apache.hc.core5.http.NameValuePair;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
import org.netpreserve.jwarc.*;
|
||||
import org.slf4j.Logger;
|
||||
@@ -14,10 +19,9 @@ import org.slf4j.LoggerFactory;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.InetAddress;
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
@@ -37,7 +41,7 @@ public class WarcRecorder implements AutoCloseable {
|
||||
static final int MAX_TIME = 30_000;
|
||||
|
||||
/** Maximum (decompressed) size we'll save */
|
||||
static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);
|
||||
static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 32 * 1024 * 1024);
|
||||
|
||||
private final WarcWriter writer;
|
||||
private final Path warcFile;
|
||||
@@ -48,22 +52,15 @@ public class WarcRecorder implements AutoCloseable {
|
||||
// Affix a version string in case we need to change the format in the future
|
||||
// in some way
|
||||
private final String warcRecorderVersion = "1.0";
|
||||
private final Cookies cookies;
|
||||
private final LinkParser linkParser = new LinkParser();
|
||||
/**
|
||||
* Create a new WarcRecorder that will write to the given file
|
||||
*
|
||||
* @param warcFile The file to write to
|
||||
*/
|
||||
public WarcRecorder(Path warcFile, HttpFetcherImpl fetcher) throws IOException {
|
||||
public WarcRecorder(Path warcFile) throws IOException {
|
||||
this.warcFile = warcFile;
|
||||
this.writer = new WarcWriter(warcFile);
|
||||
this.cookies = fetcher.getCookies();
|
||||
}
|
||||
|
||||
public WarcRecorder(Path warcFile, Cookies cookies) throws IOException {
|
||||
this.warcFile = warcFile;
|
||||
this.writer = new WarcWriter(warcFile);
|
||||
this.cookies = cookies;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -73,16 +70,25 @@ public class WarcRecorder implements AutoCloseable {
|
||||
public WarcRecorder() throws IOException {
|
||||
this.warcFile = Files.createTempFile("warc", ".warc.gz");
|
||||
this.writer = new WarcWriter(this.warcFile);
|
||||
this.cookies = new Cookies();
|
||||
|
||||
temporaryFile = true;
|
||||
}
|
||||
|
||||
public HttpFetchResult fetch(HttpClient client,
|
||||
java.net.http.HttpRequest request)
|
||||
DomainCookies cookies,
|
||||
HttpGet request)
|
||||
throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
|
||||
{
|
||||
URI requestUri = request.uri();
|
||||
return fetch(client, cookies, request, Duration.ofMillis(MAX_TIME));
|
||||
}
|
||||
|
||||
public HttpFetchResult fetch(HttpClient client,
|
||||
DomainCookies cookies,
|
||||
HttpGet request,
|
||||
Duration timeout)
|
||||
throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
|
||||
{
|
||||
URI requestUri = request.getUri();
|
||||
|
||||
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
|
||||
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
|
||||
@@ -90,121 +96,151 @@ public class WarcRecorder implements AutoCloseable {
|
||||
Instant date = Instant.now();
|
||||
|
||||
// Not entirely sure why we need to do this, but keeping it due to Chesterton's Fence
|
||||
Map<String, List<String>> extraHeaders = new HashMap<>(request.headers().map());
|
||||
Map<String, List<String>> extraHeaders = new HashMap<>(request.getHeaders().length);
|
||||
|
||||
HttpResponse<InputStream> response;
|
||||
// Inject a range header to attempt to limit the size of the response
|
||||
// to the maximum size we want to store, if the server supports it.
|
||||
request.addHeader("Range", "bytes=0-"+MAX_SIZE);
|
||||
cookies.paintRequest(request);
|
||||
try {
|
||||
response = client.send(request, java.net.http.HttpResponse.BodyHandlers.ofInputStream());
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.warn("Failed to fetch URL {}: {}", requestUri, ex.getMessage());
|
||||
return client.execute(request,response -> {
|
||||
|
||||
try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request, timeout);
|
||||
InputStream inputStream = inputBuffer.read()) {
|
||||
|
||||
cookies.updateCookieStore(response);
|
||||
|
||||
// Build and write the request
|
||||
|
||||
WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder();
|
||||
|
||||
byte[] httpRequestString = WarcProtocolReconstructor
|
||||
.getHttpRequestString(
|
||||
request.getMethod(),
|
||||
request.getHeaders(),
|
||||
extraHeaders,
|
||||
requestUri)
|
||||
.getBytes();
|
||||
|
||||
requestDigestBuilder.update(httpRequestString);
|
||||
|
||||
WarcRequest warcRequest = new WarcRequest.Builder(requestUri)
|
||||
.blockDigest(requestDigestBuilder.build())
|
||||
.date(date)
|
||||
.body(MediaType.HTTP_REQUEST, httpRequestString)
|
||||
.build();
|
||||
|
||||
warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
|
||||
writer.write(warcRequest);
|
||||
|
||||
|
||||
if (cookies.hasCookies()) {
|
||||
response.addHeader("X-Has-Cookies", 1);
|
||||
}
|
||||
|
||||
byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
|
||||
|
||||
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);
|
||||
|
||||
responseDataBuffer.put(responseHeaders);
|
||||
responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseHeaders.length);
|
||||
|
||||
int dataStart = responseDataBuffer.pos();
|
||||
|
||||
for (;;) {
|
||||
int remainingLength = responseDataBuffer.remaining();
|
||||
if (remainingLength == 0)
|
||||
break;
|
||||
|
||||
int startPos = responseDataBuffer.pos();
|
||||
|
||||
int n = responseDataBuffer.readFrom(inputStream, remainingLength);
|
||||
if (n < 0)
|
||||
break;
|
||||
|
||||
responseDataBuffer.updateDigest(responseDigestBuilder, startPos, n);
|
||||
responseDataBuffer.updateDigest(payloadDigestBuilder, startPos, n);
|
||||
}
|
||||
|
||||
// with some http client libraries, that resolve redirects transparently, this might be different
|
||||
// from the request URI, but currently we don't have transparent redirect resolution so it's always
|
||||
// the same (though let's keep the variables separate in case this changes)
|
||||
final URI responseUri = requestUri;
|
||||
|
||||
WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
|
||||
.blockDigest(responseDigestBuilder.build())
|
||||
.date(date)
|
||||
.concurrentTo(warcRequest.id())
|
||||
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
|
||||
|
||||
InetAddress inetAddress = InetAddress.getByName(responseUri.getHost());
|
||||
responseBuilder.ipAddress(inetAddress);
|
||||
responseBuilder.payloadDigest(payloadDigestBuilder.build());
|
||||
responseBuilder.truncated(inputBuffer.truncationReason());
|
||||
|
||||
// Build and write the response
|
||||
|
||||
var warcResponse = responseBuilder.build();
|
||||
warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
|
||||
writer.write(warcResponse);
|
||||
|
||||
if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
|
||||
&& inputBuffer.size() < 2048
|
||||
&& !requestUri.getPath().endsWith("robots.txt")) // don't bail on robots.txt
|
||||
{
|
||||
// Fast detection and mitigation of crawler traps that respond with slow
|
||||
// small responses, with a high branching factor
|
||||
|
||||
// Note we bail *after* writing the warc records, this will effectively only
|
||||
// prevent link extraction from the document.
|
||||
|
||||
logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)",
|
||||
requestUri,
|
||||
Duration.between(date, Instant.now()).getSeconds(),
|
||||
inputBuffer.size()
|
||||
);
|
||||
|
||||
return new HttpFetchResult.ResultException(new IOException("Likely crawler trap"));
|
||||
}
|
||||
|
||||
if (response.getCode() == 301 || response.getCode() == 302 || response.getCode() == 307) {
|
||||
// If the server responds with a redirect, we need to
|
||||
// update the request URI to the new location
|
||||
EdgeUrl redirectLocation = Optional.ofNullable(response.getFirstHeader("Location"))
|
||||
.map(NameValuePair::getValue)
|
||||
.flatMap(location -> linkParser.parseLink(new EdgeUrl(requestUri), location))
|
||||
.orElse(null);
|
||||
if (redirectLocation != null) {
|
||||
// If the redirect location is a valid URL, we need to update the request URI
|
||||
return new HttpFetchResult.ResultRedirect(redirectLocation);
|
||||
} else {
|
||||
// If the redirect location is not a valid URL, we need to throw an exception
|
||||
return new HttpFetchResult.ResultException(new IOException("Invalid redirect location: " + response.getFirstHeader("Location")));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return new HttpFetchResult.ResultOk(responseUri,
|
||||
response.getCode(),
|
||||
inputBuffer.headers(),
|
||||
inetAddress.getHostAddress(),
|
||||
responseDataBuffer.data,
|
||||
dataStart,
|
||||
responseDataBuffer.length() - dataStart);
|
||||
} catch (Exception ex) {
|
||||
flagAsError(new EdgeUrl(requestUri), ex); // write a WARC record to indicate the error
|
||||
logger.warn("Failed to fetch URL {}: {}", requestUri, ex.getMessage());
|
||||
return new HttpFetchResult.ResultException(ex);
|
||||
}
|
||||
});
|
||||
// the client.execute() method will throw an exception if the request times out
|
||||
// or on other IO exceptions, so we need to catch those here as well as having
|
||||
// exception handling in the response handler
|
||||
} catch (SocketTimeoutException ex) {
|
||||
flagAsTimeout(new EdgeUrl(requestUri)); // write a WARC record to indicate the timeout
|
||||
return new HttpFetchResult.ResultException(ex);
|
||||
}
|
||||
|
||||
|
||||
try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request.timeout().orElseGet(() -> Duration.ofMillis(MAX_TIME)));
|
||||
InputStream inputStream = inputBuffer.read())
|
||||
{
|
||||
if (cookies.hasCookies()) {
|
||||
extraHeaders.put("X-Has-Cookies", List.of("1"));
|
||||
}
|
||||
|
||||
byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
|
||||
|
||||
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);
|
||||
|
||||
responseDataBuffer.put(responseHeaders);
|
||||
responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseHeaders.length);
|
||||
|
||||
int dataStart = responseDataBuffer.pos();
|
||||
|
||||
for (;;) {
|
||||
int remainingLength = responseDataBuffer.remaining();
|
||||
if (remainingLength == 0)
|
||||
break;
|
||||
|
||||
int startPos = responseDataBuffer.pos();
|
||||
|
||||
int n = responseDataBuffer.readFrom(inputStream, remainingLength);
|
||||
if (n < 0)
|
||||
break;
|
||||
|
||||
responseDataBuffer.updateDigest(responseDigestBuilder, startPos, n);
|
||||
responseDataBuffer.updateDigest(payloadDigestBuilder, startPos, n);
|
||||
}
|
||||
|
||||
// It looks like this might be the same as requestUri, but it's not;
|
||||
// it's the URI after resolving redirects.
|
||||
final URI responseUri = response.uri();
|
||||
|
||||
WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
|
||||
.blockDigest(responseDigestBuilder.build())
|
||||
.date(date)
|
||||
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
|
||||
|
||||
InetAddress inetAddress = InetAddress.getByName(responseUri.getHost());
|
||||
responseBuilder.ipAddress(inetAddress);
|
||||
responseBuilder.payloadDigest(payloadDigestBuilder.build());
|
||||
responseBuilder.truncated(inputBuffer.truncationReason());
|
||||
|
||||
// Build and write the response
|
||||
|
||||
var warcResponse = responseBuilder.build();
|
||||
warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
|
||||
writer.write(warcResponse);
|
||||
|
||||
// Build and write the request
|
||||
|
||||
WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder();
|
||||
|
||||
byte[] httpRequestString = WarcProtocolReconstructor
|
||||
.getHttpRequestString(
|
||||
response.request().method(),
|
||||
response.request().headers().map(),
|
||||
extraHeaders,
|
||||
requestUri)
|
||||
.getBytes();
|
||||
|
||||
requestDigestBuilder.update(httpRequestString);
|
||||
|
||||
WarcRequest warcRequest = new WarcRequest.Builder(requestUri)
|
||||
.blockDigest(requestDigestBuilder.build())
|
||||
.date(date)
|
||||
.body(MediaType.HTTP_REQUEST, httpRequestString)
|
||||
.concurrentTo(warcResponse.id())
|
||||
.build();
|
||||
|
||||
warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
|
||||
writer.write(warcRequest);
|
||||
|
||||
if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
|
||||
&& inputBuffer.size() < 2048
|
||||
&& !request.uri().getPath().endsWith("robots.txt")) // don't bail on robots.txt
|
||||
{
|
||||
// Fast detection and mitigation of crawler traps that respond with slow
|
||||
// small responses, with a high branching factor
|
||||
|
||||
// Note we bail *after* writing the warc records, this will effectively only
|
||||
// prevent link extraction from the document.
|
||||
|
||||
logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)",
|
||||
requestUri,
|
||||
Duration.between(date, Instant.now()).getSeconds(),
|
||||
inputBuffer.size()
|
||||
);
|
||||
|
||||
return new HttpFetchResult.ResultException(new IOException("Likely crawler trap"));
|
||||
}
|
||||
|
||||
return new HttpFetchResult.ResultOk(responseUri,
|
||||
response.statusCode(),
|
||||
inputBuffer.headers(),
|
||||
inetAddress.getHostAddress(),
|
||||
responseDataBuffer.data,
|
||||
dataStart,
|
||||
responseDataBuffer.length() - dataStart);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
} catch (IOException ex) {
|
||||
flagAsError(new EdgeUrl(requestUri), ex); // write a WARC record to indicate the error
|
||||
logger.warn("Failed to fetch URL {}: {}", requestUri, ex.getMessage());
|
||||
return new HttpFetchResult.ResultException(ex);
|
||||
}
|
||||
@@ -214,7 +250,7 @@ public class WarcRecorder implements AutoCloseable {
|
||||
writer.write(item);
|
||||
}
|
||||
|
||||
private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags contentTags) {
|
||||
private void saveOldResponse(EdgeUrl url, DomainCookies domainCookies, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags contentTags) {
|
||||
try {
|
||||
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
|
||||
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
|
||||
@@ -275,7 +311,7 @@ public class WarcRecorder implements AutoCloseable {
|
||||
.date(Instant.now())
|
||||
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
|
||||
|
||||
if (cookies.hasCookies()) {
|
||||
if (domainCookies.hasCookies() || (headers != null && headers.contains("Set-Cookie:"))) {
|
||||
builder.addHeader("X-Has-Cookies", "1");
|
||||
}
|
||||
|
||||
@@ -295,8 +331,8 @@ public class WarcRecorder implements AutoCloseable {
|
||||
* an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this
|
||||
* scenario we want to record the data as it was in the previous crawl, but not re-fetch it.
|
||||
*/
|
||||
public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags ctags) {
|
||||
saveOldResponse(url, contentType, statusCode, documentBody, headers, ctags);
|
||||
public void writeReferenceCopy(EdgeUrl url, DomainCookies cookies, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags ctags) {
|
||||
saveOldResponse(url, cookies, contentType, statusCode, documentBody, headers, ctags);
|
||||
}
|
||||
|
||||
public void writeWarcinfoHeader(String ip, EdgeDomain domain, HttpFetcherImpl.DomainProbeResult result) throws IOException {
|
||||
@@ -316,6 +352,9 @@ public class WarcRecorder implements AutoCloseable {
|
||||
case HttpFetcherImpl.DomainProbeResult.Ok ok:
|
||||
fields.put("X-WARC-Probe-Status", List.of("OK"));
|
||||
break;
|
||||
case HttpFetcher.DomainProbeResult.RedirectSameDomain_Internal redirectSameDomain:
|
||||
fields.put("X-WARC-Probe-Status", List.of("REDIR-INTERNAL"));
|
||||
break;
|
||||
}
|
||||
|
||||
var warcinfo = new Warcinfo.Builder()
|
||||
|
@@ -3,6 +3,7 @@ package nu.marginalia.crawl.logic;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.Semaphore;
|
||||
|
||||
@@ -19,8 +20,22 @@ public class DomainLocks {
|
||||
* and may be held by another thread. The caller is responsible for locking and releasing the lock.
|
||||
*/
|
||||
public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
|
||||
return new DomainLock(domain.toString(),
|
||||
locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits));
|
||||
var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
|
||||
|
||||
sem.acquire();
|
||||
|
||||
return new DomainLock(sem);
|
||||
}
|
||||
|
||||
public Optional<DomainLock> tryLockDomain(EdgeDomain domain) {
|
||||
var sem = locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
|
||||
if (sem.tryAcquire(1)) {
|
||||
return Optional.of(new DomainLock(sem));
|
||||
}
|
||||
else {
|
||||
// We don't have a lock, so we return an empty optional
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
private Semaphore defaultPermits(String topDomain) {
|
||||
@@ -28,23 +43,27 @@ public class DomainLocks {
|
||||
return new Semaphore(16);
|
||||
if (topDomain.equals("blogspot.com"))
|
||||
return new Semaphore(8);
|
||||
|
||||
if (topDomain.equals("tumblr.com"))
|
||||
return new Semaphore(8);
|
||||
if (topDomain.equals("neocities.org"))
|
||||
return new Semaphore(4);
|
||||
return new Semaphore(8);
|
||||
if (topDomain.equals("github.io"))
|
||||
return new Semaphore(4);
|
||||
return new Semaphore(8);
|
||||
|
||||
// Substack really dislikes broad-scale crawlers, so we need to be careful
|
||||
// to not get blocked.
|
||||
if (topDomain.equals("substack.com")) {
|
||||
return new Semaphore(1);
|
||||
}
|
||||
if (topDomain.endsWith(".edu")) {
|
||||
return new Semaphore(1);
|
||||
}
|
||||
|
||||
return new Semaphore(2);
|
||||
}
|
||||
|
||||
public boolean canLock(EdgeDomain domain) {
|
||||
/** Returns true if the domain is lockable, i.e. if it is not already locked by another thread.
|
||||
* (this is just a hint, and does not guarantee that the domain is actually lockable any time
|
||||
* after this method returns true)
|
||||
*/
|
||||
public boolean isLockableHint(EdgeDomain domain) {
|
||||
Semaphore sem = locks.get(domain.topDomain.toLowerCase());
|
||||
if (null == sem)
|
||||
return true;
|
||||
@@ -53,22 +72,16 @@ public class DomainLocks {
|
||||
}
|
||||
|
||||
public static class DomainLock implements AutoCloseable {
|
||||
private final String domainName;
|
||||
private final Semaphore semaphore;
|
||||
|
||||
DomainLock(String domainName, Semaphore semaphore) throws InterruptedException {
|
||||
this.domainName = domainName;
|
||||
DomainLock(Semaphore semaphore) {
|
||||
this.semaphore = semaphore;
|
||||
|
||||
Thread.currentThread().setName("crawling:" + domainName + " [await domain lock]");
|
||||
semaphore.acquire();
|
||||
Thread.currentThread().setName("crawling:" + domainName);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws Exception {
|
||||
semaphore.release();
|
||||
Thread.currentThread().setName("crawling:" + domainName + " [wrapping up]");
|
||||
Thread.currentThread().setName("[idle]");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -51,6 +51,10 @@ public class CrawlDelayTimer {
|
||||
waitFetchDelay(0);
|
||||
}
|
||||
|
||||
public void waitFetchDelay(Duration spentTime) {
|
||||
waitFetchDelay(spentTime.toMillis());
|
||||
}
|
||||
|
||||
public void waitFetchDelay(long spentTime) {
|
||||
long sleepTime = delayTime;
|
||||
|
||||
|
@@ -0,0 +1,42 @@
|
||||
package nu.marginalia.crawl.retreival;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.concurrent.Semaphore;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/**
 * This class is used to stagger the rate at which connections are created.
 * <p>
 * It is used to ensure that we do not create too many connections at once,
 * which can lead to network congestion and other issues.  Since the connections
 * tend to be very long-lived, we can afford to wait a bit before creating the next
 * even if it adds a bit of build-up time when the crawl starts.
 * <p>
 * Callers are serialized through a single-permit semaphore, so at most one
 * thread at a time is waiting out the launch interval.
 */
public class CrawlerConnectionThrottle {
    // Only read and written while holding the launchSemaphore permit
    private Instant lastCrawlStart = Instant.EPOCH;
    private final Semaphore launchSemaphore = new Semaphore(1);

    private final Duration launchInterval;

    /**
     * @param launchInterval minimum time between two granted connection permissions
     */
    public CrawlerConnectionThrottle(Duration launchInterval) {
        this.launchInterval = launchInterval;
    }

    /**
     * Blocks until at least {@code launchInterval} has passed since the previous
     * permission was granted, then records the current time as the latest launch.
     *
     * @throws InterruptedException if the thread is interrupted while waiting for
     *                              its turn or while sleeping out the interval
     */
    public void waitForConnectionPermission() throws InterruptedException {
        // Acquire *before* entering the try block: if acquire() is interrupted we hold
        // no permit, and releasing one in the finally clause would permanently raise
        // the permit count above 1 and let several threads through at once.
        launchSemaphore.acquire();
        try {
            Instant now = Instant.now();
            Instant nextPermittedLaunch = lastCrawlStart.plus(launchInterval);

            if (nextPermittedLaunch.isAfter(now)) {
                long waitTime = Duration.between(now, nextPermittedLaunch).toMillis();
                TimeUnit.MILLISECONDS.sleep(waitTime);
            }

            lastCrawlStart = Instant.now();
        }
        finally {
            launchSemaphore.release();
        }
    }
}
|
@@ -6,8 +6,8 @@ import nu.marginalia.contenttype.ContentType;
|
||||
import nu.marginalia.crawl.CrawlerMain;
|
||||
import nu.marginalia.crawl.DomainStateDb;
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.logic.LinkFilterSelector;
|
||||
import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
|
||||
@@ -29,13 +29,13 @@ import java.nio.file.Path;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class CrawlerRetreiver implements AutoCloseable {
|
||||
|
||||
private static final int MAX_ERRORS = 20;
|
||||
private static final int HTTP_429_RETRY_LIMIT = 1; // Retry 429s once
|
||||
|
||||
private final HttpFetcher fetcher;
|
||||
|
||||
@@ -52,6 +52,11 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
private final DomainStateDb domainStateDb;
|
||||
private final WarcRecorder warcRecorder;
|
||||
private final CrawlerRevisitor crawlerRevisitor;
|
||||
private final DomainCookies cookies = new DomainCookies();
|
||||
|
||||
private static final CrawlerConnectionThrottle connectionThrottle = new CrawlerConnectionThrottle(
|
||||
Duration.ofSeconds(1) // pace the connections to avoid network congestion at startup
|
||||
);
|
||||
|
||||
int errorCount = 0;
|
||||
|
||||
@@ -92,6 +97,11 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
|
||||
public int crawlDomain(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
|
||||
try (oldCrawlData) {
|
||||
|
||||
// Wait for permission to open a connection to avoid network congestion
|
||||
// from hundreds/thousands of TCP handshakes
|
||||
connectionThrottle.waitForConnectionPermission();
|
||||
|
||||
// Do an initial domain probe to determine the root URL
|
||||
var probeResult = probeRootUrl();
|
||||
|
||||
@@ -116,7 +126,7 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
}
|
||||
|
||||
Instant recrawlStart = Instant.now();
|
||||
CrawlerRevisitor.RecrawlMetadata recrawlMetadata = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
|
||||
CrawlerRevisitor.RecrawlMetadata recrawlMetadata = crawlerRevisitor.recrawl(oldCrawlData, cookies, robotsRules, delayTimer);
|
||||
Duration recrawlTime = Duration.between(recrawlStart, Instant.now());
|
||||
|
||||
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
|
||||
@@ -137,6 +147,10 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, status.toString(), desc));
|
||||
yield 1;
|
||||
}
|
||||
default -> {
|
||||
logger.error("Unexpected domain probe result {}", probeResult);
|
||||
yield 1;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
@@ -254,17 +268,29 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
return domainProbeResult;
|
||||
}
|
||||
|
||||
|
||||
|
||||
private DomainStateDb.SummaryRecord sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
|
||||
Optional<String> feedLink = Optional.empty();
|
||||
|
||||
try {
|
||||
var url = rootUrl.withPathAndParam("/", null);
|
||||
|
||||
HttpFetchResult result = fetchWithRetry(url, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
|
||||
HttpFetchResult result = fetcher.fetchContent(url, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
|
||||
timer.waitFetchDelay(0);
|
||||
|
||||
if (!(result instanceof HttpFetchResult.ResultOk ok))
|
||||
if (result instanceof HttpFetchResult.ResultRedirect(EdgeUrl location)) {
|
||||
if (Objects.equals(location.domain, url.domain)) {
|
||||
// TODO: Follow the redirect to the new location and sniff the document
|
||||
crawlFrontier.addFirst(location);
|
||||
}
|
||||
|
||||
return DomainStateDb.SummaryRecord.forSuccess(domain);
|
||||
}
|
||||
|
||||
if (!(result instanceof HttpFetchResult.ResultOk ok)) {
|
||||
return DomainStateDb.SummaryRecord.forSuccess(domain);
|
||||
}
|
||||
|
||||
var optDoc = ok.parseDocument();
|
||||
if (optDoc.isEmpty())
|
||||
@@ -313,7 +339,7 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
|
||||
// Grab the favicon if it exists
|
||||
|
||||
if (fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty()) instanceof HttpFetchResult.ResultOk iconResult) {
|
||||
if (fetcher.fetchContent(faviconUrl, warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED) instanceof HttpFetchResult.ResultOk iconResult) {
|
||||
String contentType = iconResult.header("Content-Type");
|
||||
byte[] iconData = iconResult.getBodyBytes();
|
||||
|
||||
@@ -383,7 +409,7 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
if (parsedOpt.isEmpty())
|
||||
return false;
|
||||
|
||||
HttpFetchResult result = fetchWithRetry(parsedOpt.get(), timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
|
||||
HttpFetchResult result = fetcher.fetchContent(parsedOpt.get(), warcRecorder, cookies, timer, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
|
||||
timer.waitFetchDelay(0);
|
||||
|
||||
if (!(result instanceof HttpFetchResult.ResultOk ok)) {
|
||||
@@ -409,112 +435,63 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
CrawlDelayTimer timer,
|
||||
DocumentWithReference reference) throws InterruptedException
|
||||
{
|
||||
logger.debug("Fetching {}", top);
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
var contentTags = reference.getContentTags();
|
||||
|
||||
HttpFetchResult fetchedDoc = fetchWithRetry(top, timer, HttpFetcher.ProbeType.FULL, contentTags);
|
||||
HttpFetchResult fetchedDoc = fetcher.fetchContent(top, warcRecorder, cookies, timer, contentTags, HttpFetcher.ProbeType.FULL);
|
||||
timer.waitFetchDelay();
|
||||
|
||||
if (Thread.interrupted()) {
|
||||
Thread.currentThread().interrupt();
|
||||
throw new InterruptedException();
|
||||
}
|
||||
|
||||
// Parse the document and enqueue links
|
||||
try {
|
||||
if (fetchedDoc instanceof HttpFetchResult.ResultOk ok) {
|
||||
var docOpt = ok.parseDocument();
|
||||
if (docOpt.isPresent()) {
|
||||
var doc = docOpt.get();
|
||||
switch (fetchedDoc) {
|
||||
case HttpFetchResult.ResultOk ok -> {
|
||||
var docOpt = ok.parseDocument();
|
||||
if (docOpt.isPresent()) {
|
||||
var doc = docOpt.get();
|
||||
|
||||
var responseUrl = new EdgeUrl(ok.uri());
|
||||
var responseUrl = new EdgeUrl(ok.uri());
|
||||
|
||||
crawlFrontier.enqueueLinksFromDocument(responseUrl, doc);
|
||||
crawlFrontier.addVisited(responseUrl);
|
||||
crawlFrontier.enqueueLinksFromDocument(responseUrl, doc);
|
||||
crawlFrontier.addVisited(responseUrl);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {
|
||||
var doc = reference.doc();
|
||||
case HttpFetchResult.Result304Raw ref when reference.doc() != null ->
|
||||
{
|
||||
var doc = reference.doc();
|
||||
|
||||
warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBodyBytes, doc.headers, contentTags);
|
||||
warcRecorder.writeReferenceCopy(top, cookies, doc.contentType, doc.httpStatus, doc.documentBodyBytes, doc.headers, contentTags);
|
||||
|
||||
fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
|
||||
new ContentType(doc.contentType, "UTF-8"),
|
||||
doc.documentBodyBytes);
|
||||
fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
|
||||
new ContentType(doc.contentType, "UTF-8"),
|
||||
doc.documentBodyBytes);
|
||||
|
||||
if (doc.documentBodyBytes != null) {
|
||||
var parsed = doc.parseBody();
|
||||
if (doc.documentBodyBytes != null) {
|
||||
var parsed = doc.parseBody();
|
||||
|
||||
crawlFrontier.enqueueLinksFromDocument(top, parsed);
|
||||
crawlFrontier.addVisited(top);
|
||||
crawlFrontier.enqueueLinksFromDocument(top, parsed);
|
||||
crawlFrontier.addVisited(top);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (fetchedDoc instanceof HttpFetchResult.ResultException) {
|
||||
errorCount ++;
|
||||
case HttpFetchResult.ResultRedirect(EdgeUrl location) -> {
|
||||
if (Objects.equals(location.domain, top.domain)) {
|
||||
crawlFrontier.addFirst(location);
|
||||
}
|
||||
}
|
||||
case HttpFetchResult.ResultException ex -> errorCount++;
|
||||
default -> {} // Ignore other types
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Error parsing document {}", top, ex);
|
||||
}
|
||||
|
||||
timer.waitFetchDelay(System.currentTimeMillis() - startTime);
|
||||
|
||||
return fetchedDoc;
|
||||
}
|
||||
|
||||
/** Fetch a document and retry on 429s */
|
||||
private HttpFetchResult fetchWithRetry(EdgeUrl url,
|
||||
CrawlDelayTimer timer,
|
||||
HttpFetcher.ProbeType probeType,
|
||||
ContentTags contentTags) throws InterruptedException {
|
||||
|
||||
long probeStart = System.currentTimeMillis();
|
||||
|
||||
if (probeType == HttpFetcher.ProbeType.FULL) {
|
||||
retryLoop:
|
||||
for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
|
||||
try {
|
||||
var probeResult = fetcher.probeContentType(url, warcRecorder, contentTags);
|
||||
|
||||
switch (probeResult) {
|
||||
case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl):
|
||||
url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching
|
||||
break retryLoop;
|
||||
case HttpFetcher.ContentTypeProbeResult.BadContentType badContentType:
|
||||
return new HttpFetchResult.ResultNone();
|
||||
case HttpFetcher.ContentTypeProbeResult.BadContentType.Timeout timeout:
|
||||
return new HttpFetchResult.ResultException(timeout.ex());
|
||||
case HttpFetcher.ContentTypeProbeResult.Exception exception:
|
||||
return new HttpFetchResult.ResultException(exception.ex());
|
||||
default: // should be unreachable
|
||||
throw new IllegalStateException("Unknown probe result");
|
||||
}
|
||||
}
|
||||
catch (HttpFetcherImpl.RateLimitException ex) {
|
||||
timer.waitRetryDelay(ex);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.warn("Failed to fetch {}", url, ex);
|
||||
return new HttpFetchResult.ResultException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
timer.waitFetchDelay(System.currentTimeMillis() - probeStart);
|
||||
}
|
||||
|
||||
|
||||
for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
|
||||
try {
|
||||
return fetcher.fetchContent(url, warcRecorder, contentTags, probeType);
|
||||
}
|
||||
catch (HttpFetcherImpl.RateLimitException ex) {
|
||||
timer.waitRetryDelay(ex);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.warn("Failed to fetch {}", url, ex);
|
||||
return new HttpFetchResult.ResultException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
return new HttpFetchResult.ResultNone();
|
||||
}
|
||||
|
||||
private boolean isAllowedProtocol(String proto) {
|
||||
return proto.equalsIgnoreCase("http")
|
||||
|| proto.equalsIgnoreCase("https");
|
||||
|
@@ -55,6 +55,9 @@ public class DomainCrawlFrontier {
|
||||
}
|
||||
}
|
||||
|
||||
public EdgeDomain getDomain() {
|
||||
return thisDomain;
|
||||
}
|
||||
/** Increase the depth of the crawl by a factor. If the current depth is smaller
|
||||
* than the number of already visited documents, the base depth will be adjusted
|
||||
* to the visited count first.
|
||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.crawl.retreival.revisit;
|
||||
|
||||
import crawlercommons.robots.SimpleRobotRules;
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.retreival.CrawlDataReference;
|
||||
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
||||
@@ -10,6 +11,8 @@ import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.body.HttpFetchResult;
|
||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
@@ -18,10 +21,13 @@ import java.io.IOException;
|
||||
* E-Tag and Last-Modified headers.
|
||||
*/
|
||||
public class CrawlerRevisitor {
|
||||
|
||||
private final DomainCrawlFrontier crawlFrontier;
|
||||
private final CrawlerRetreiver crawlerRetreiver;
|
||||
private final WarcRecorder warcRecorder;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(CrawlerRevisitor.class);
|
||||
|
||||
public CrawlerRevisitor(DomainCrawlFrontier crawlFrontier,
|
||||
CrawlerRetreiver crawlerRetreiver,
|
||||
WarcRecorder warcRecorder) {
|
||||
@@ -32,6 +38,7 @@ public class CrawlerRevisitor {
|
||||
|
||||
/** Performs a re-crawl of old documents, comparing etags and last-modified */
|
||||
public RecrawlMetadata recrawl(CrawlDataReference oldCrawlData,
|
||||
DomainCookies cookies,
|
||||
SimpleRobotRules robotsRules,
|
||||
CrawlDelayTimer delayTimer)
|
||||
throws InterruptedException {
|
||||
@@ -127,6 +134,7 @@ public class CrawlerRevisitor {
|
||||
}
|
||||
// Add a WARC record so we don't repeat this
|
||||
warcRecorder.writeReferenceCopy(url,
|
||||
cookies,
|
||||
doc.contentType,
|
||||
doc.httpStatus,
|
||||
doc.documentBodyBytes,
|
||||
@@ -151,11 +159,13 @@ public class CrawlerRevisitor {
|
||||
else if (result instanceof HttpFetchResult.ResultException) {
|
||||
errors++;
|
||||
}
|
||||
|
||||
recrawled++;
|
||||
}
|
||||
}
|
||||
|
||||
logger.info("Recrawl summary {}: {} recrawled, {} retained, {} errors, {} skipped",
|
||||
crawlFrontier.getDomain(), recrawled, retained, errors, skipped);
|
||||
|
||||
return new RecrawlMetadata(size, errors, skipped);
|
||||
}
|
||||
|
||||
|
@@ -6,6 +6,7 @@ import nu.marginalia.model.body.HttpFetchResult;
|
||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.Objects;
|
||||
|
||||
public record DocumentWithReference(
|
||||
@Nullable CrawledDocument doc,
|
||||
@@ -33,8 +34,22 @@ public record DocumentWithReference(
|
||||
return false;
|
||||
if (doc == null)
|
||||
return false;
|
||||
if (doc.documentBodyBytes.length == 0)
|
||||
return false;
|
||||
if (doc.documentBodyBytes.length == 0) {
|
||||
if (doc.httpStatus < 300) {
|
||||
return resultOk.bytesLength() == 0;
|
||||
}
|
||||
else if (doc.httpStatus == 301 || doc.httpStatus == 302 || doc.httpStatus == 307) {
|
||||
@Nullable
|
||||
String docLocation = doc.getHeader("Location");
|
||||
@Nullable
|
||||
String resultLocation = resultOk.header("Location");
|
||||
|
||||
return Objects.equals(docLocation, resultLocation);
|
||||
}
|
||||
else {
|
||||
return doc.httpStatus == resultOk.statusCode();
|
||||
}
|
||||
}
|
||||
|
||||
return CrawlDataReference.isContentBodySame(doc.documentBodyBytes, resultOk.bytesRaw());
|
||||
}
|
||||
|
@@ -41,6 +41,8 @@ dependencies {
|
||||
implementation libs.snakeyaml
|
||||
implementation libs.zstd
|
||||
|
||||
implementation libs.bundles.httpcomponents
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation libs.mockito
|
||||
|
@@ -6,6 +6,7 @@ public class ContentTypes {
|
||||
public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
|
||||
"application/xhtml",
|
||||
"text/html",
|
||||
"application/pdf",
|
||||
"image/x-icon",
|
||||
"text/plain");
|
||||
|
||||
@@ -19,4 +20,9 @@ public class ContentTypes {
|
||||
return false;
|
||||
}
|
||||
|
||||
public static boolean isBinary(String contentTypeHeader) {
|
||||
String lcHeader = contentTypeHeader.toLowerCase();
|
||||
return lcHeader.startsWith("application/pdf");
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -37,8 +37,12 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
|
||||
public boolean filter(String url, int status, String contentType) {
|
||||
String ctLc = contentType.toLowerCase();
|
||||
|
||||
// Permit all plain text content types
|
||||
if (ctLc.startsWith("text/"))
|
||||
return true;
|
||||
// PDF
|
||||
else if (ctLc.startsWith("application/pdf"))
|
||||
return true;
|
||||
else if (ctLc.startsWith("x-marginalia/"))
|
||||
return true;
|
||||
|
||||
|
@@ -10,7 +10,7 @@ import java.util.regex.Pattern;
|
||||
|
||||
public class ContentTypeLogic {
|
||||
|
||||
private static final Predicate<String> probableHtmlPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md)$").asMatchPredicate();
|
||||
private static final Predicate<String> probableGoodPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md|pdf)$").asMatchPredicate();
|
||||
private static final Predicate<String> probableBinaryPattern = Pattern.compile("^.*\\.[a-z]+$").asMatchPredicate();
|
||||
private static final Set<String> blockedContentTypes = Set.of("text/css", "text/javascript");
|
||||
private static final List<String> acceptedContentTypePrefixes = List.of(
|
||||
@@ -22,6 +22,7 @@ public class ContentTypeLogic {
|
||||
"application/rss+xml",
|
||||
"application/x-rss+xml",
|
||||
"application/rdf+xml",
|
||||
"application/pdf",
|
||||
"x-rss+xml"
|
||||
);
|
||||
private boolean allowAllContentTypes = false;
|
||||
@@ -34,7 +35,7 @@ public class ContentTypeLogic {
|
||||
public boolean isUrlLikeBinary(EdgeUrl url) {
|
||||
String pathLowerCase = url.path.toLowerCase();
|
||||
|
||||
if (probableHtmlPattern.test(pathLowerCase))
|
||||
if (probableGoodPattern.test(pathLowerCase))
|
||||
return false;
|
||||
|
||||
return probableBinaryPattern.test(pathLowerCase);
|
||||
|
@@ -1,6 +1,9 @@
|
||||
package nu.marginalia.model.body;
|
||||
|
||||
import nu.marginalia.contenttype.ContentType;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.apache.hc.core5.http.Header;
|
||||
import org.apache.hc.core5.http.message.BasicHeader;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
@@ -11,8 +14,10 @@ import java.io.ByteArrayInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.net.InetAddress;
|
||||
import java.net.URI;
|
||||
import java.net.http.HttpHeaders;
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
/* FIXME: This interface has a very unfortunate name that is not very descriptive.
|
||||
*/
|
||||
@@ -56,7 +61,7 @@ public sealed interface HttpFetchResult {
|
||||
*/
|
||||
record ResultOk(URI uri,
|
||||
int statusCode,
|
||||
HttpHeaders headers,
|
||||
Header[] headers,
|
||||
String ipAddress,
|
||||
byte[] bytesRaw, // raw data for the entire response including headers
|
||||
int bytesStart,
|
||||
@@ -67,18 +72,19 @@ public sealed interface HttpFetchResult {
|
||||
this(uri, status, convertHeaders(headers), ipAddress, bytes, bytesStart, length);
|
||||
}
|
||||
|
||||
private static HttpHeaders convertHeaders(MessageHeaders messageHeaders) {
|
||||
Map<String, List<String>> inputMap = messageHeaders.map();
|
||||
Map<String, List<String>> filteredMap = new HashMap<>(Math.max(4, inputMap.size()));
|
||||
private static Header[] convertHeaders(MessageHeaders messageHeaders) {
|
||||
List<Header> headers = new ArrayList<>(12);
|
||||
|
||||
inputMap.forEach((k, v) -> {
|
||||
messageHeaders.map().forEach((k, v) -> {
|
||||
if (k.isBlank()) return;
|
||||
if (!Character.isAlphabetic(k.charAt(0))) return;
|
||||
|
||||
filteredMap.put(k, v);
|
||||
for (var value : v) {
|
||||
headers.add(new BasicHeader(k, value));
|
||||
}
|
||||
});
|
||||
|
||||
return HttpHeaders.of(filteredMap, (k,v) -> true);
|
||||
return headers.toArray(new Header[0]);
|
||||
}
|
||||
|
||||
public boolean isOk() {
|
||||
@@ -108,7 +114,13 @@ public sealed interface HttpFetchResult {
|
||||
|
||||
@Nullable
|
||||
public String header(String name) {
|
||||
return headers.firstValue(name).orElse(null);
|
||||
for (var header : headers) {
|
||||
if (header.getName().equalsIgnoreCase(name)) {
|
||||
String headerValue = header.getValue();
|
||||
return headerValue;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -132,6 +144,12 @@ public sealed interface HttpFetchResult {
|
||||
}
|
||||
}
|
||||
|
||||
record ResultRedirect(EdgeUrl url) implements HttpFetchResult {
|
||||
public boolean isOk() {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/** Fetching resulted in a HTTP 304, the remote content is identical to
|
||||
* our reference copy. This will be replaced with a Result304ReplacedWithReference
|
||||
* at a later stage.
|
||||
|
@@ -102,7 +102,7 @@ public final class CrawledDocument implements SerializableCrawlData {
|
||||
}
|
||||
|
||||
@Nullable
|
||||
private String getHeader(String header) {
|
||||
public String getHeader(String header) {
|
||||
if (headers == null) {
|
||||
return null;
|
||||
}
|
||||
|
@@ -165,12 +165,26 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
|
||||
contentType = "";
|
||||
}
|
||||
|
||||
boolean hasCookies = false;
|
||||
String etag = null;
|
||||
String lastModified = null;
|
||||
|
||||
StringJoiner headersStrBuilder = new StringJoiner("\n");
|
||||
for (var header : headers.map().entrySet()) {
|
||||
for (var value : header.getValue()) {
|
||||
headersStrBuilder.add(header.getKey() + ": " + value);
|
||||
for (var header : headers) {
|
||||
if (header.getName().equalsIgnoreCase("X-Has-Cookies")) {
|
||||
hasCookies = hasCookies || header.getValue().equals("1");
|
||||
}
|
||||
else if (header.getName().equalsIgnoreCase("ETag")) {
|
||||
etag = header.getValue();
|
||||
}
|
||||
else if (header.getName().equalsIgnoreCase("Last-Modified")) {
|
||||
lastModified = header.getValue();
|
||||
}
|
||||
else {
|
||||
headersStrBuilder.add(header.getName() + ": " + header.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
String headersStr = headersStrBuilder.toString();
|
||||
|
||||
|
||||
@@ -178,14 +192,14 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
|
||||
domain,
|
||||
response.target(),
|
||||
fetchOk.ipAddress(),
|
||||
headers.firstValue("X-Has-Cookies").orElse("0").equals("1"),
|
||||
hasCookies,
|
||||
fetchOk.statusCode(),
|
||||
response.date(),
|
||||
contentType,
|
||||
bodyBytes,
|
||||
headersStr,
|
||||
headers.firstValue("ETag").orElse(null),
|
||||
headers.firstValue("Last-Modified").orElse(null)
|
||||
etag,
|
||||
lastModified
|
||||
));
|
||||
}
|
||||
|
||||
|
@@ -216,6 +216,11 @@ public record SlopCrawlDataRecord(String domain,
|
||||
return false;
|
||||
}
|
||||
|
||||
// If the format is binary, we don't want to translate it if the response is truncated
|
||||
if (response.truncated() != WarcTruncationReason.NOT_TRUNCATED && ContentTypes.isBinary(contentType)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -341,12 +346,15 @@ public record SlopCrawlDataRecord(String domain,
|
||||
contentType = "";
|
||||
}
|
||||
|
||||
boolean hasCookies = false;
|
||||
|
||||
String headersStr;
|
||||
StringJoiner headersStrBuilder = new StringJoiner("\n");
|
||||
for (var header : headers.map().entrySet()) {
|
||||
for (var value : header.getValue()) {
|
||||
headersStrBuilder.add(header.getKey() + ": " + value);
|
||||
for (var header : headers) {
|
||||
if (header.getName().equalsIgnoreCase("X-Cookies") && "1".equals(header.getValue())) {
|
||||
hasCookies = true;
|
||||
}
|
||||
headersStrBuilder.add(header.getName() + ": " + header.getValue());
|
||||
}
|
||||
headersStr = headersStrBuilder.toString();
|
||||
|
||||
@@ -355,7 +363,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
domain,
|
||||
response.target(),
|
||||
fetchOk.ipAddress(),
|
||||
"1".equals(headers.firstValue("X-Cookies").orElse("0")),
|
||||
hasCookies,
|
||||
fetchOk.statusCode(),
|
||||
response.date().toEpochMilli(),
|
||||
contentType,
|
||||
|
@@ -0,0 +1,146 @@
|
||||
package nu.marginalia.crawl.fetcher;
|
||||
|
||||
import com.github.tomakehurst.wiremock.WireMockServer;
|
||||
import com.github.tomakehurst.wiremock.client.WireMock;
|
||||
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
|
||||
import nu.marginalia.UserAgent;
|
||||
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.junit.jupiter.api.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
@Tag("slow")
|
||||
class HttpFetcherImplContentTypeProbeTest {
|
||||
|
||||
private HttpFetcherImpl fetcher;
|
||||
private static WireMockServer wireMockServer;
|
||||
|
||||
private static EdgeUrl timeoutUrl;
|
||||
private static EdgeUrl contentTypeHtmlUrl;
|
||||
private static EdgeUrl contentTypeBinaryUrl;
|
||||
private static EdgeUrl redirectUrl;
|
||||
private static EdgeUrl badHttpStatusUrl;
|
||||
private static EdgeUrl onlyGetAllowedUrl;
|
||||
|
||||
@BeforeAll
|
||||
public static void setupAll() throws URISyntaxException {
|
||||
wireMockServer =
|
||||
new WireMockServer(WireMockConfiguration.wireMockConfig()
|
||||
.port(18089));
|
||||
|
||||
timeoutUrl = new EdgeUrl("http://localhost:18089/timeout.bin");
|
||||
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(timeoutUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withFixedDelay(15000))); // 10 seconds delay to simulate timeout
|
||||
|
||||
contentTypeHtmlUrl = new EdgeUrl("http://localhost:18089/test.html.bin");
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(contentTypeHtmlUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withStatus(200)));
|
||||
|
||||
contentTypeBinaryUrl = new EdgeUrl("http://localhost:18089/test.bad.bin");
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(contentTypeBinaryUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "application/octet-stream")
|
||||
.withStatus(200)));
|
||||
|
||||
redirectUrl = new EdgeUrl("http://localhost:18089/redirect.bin");
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(redirectUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Location", "http://localhost:18089/test.html.bin")
|
||||
.withStatus(301)));
|
||||
|
||||
badHttpStatusUrl = new EdgeUrl("http://localhost:18089/badstatus.bin");
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(badHttpStatusUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withStatus(500)));
|
||||
|
||||
onlyGetAllowedUrl = new EdgeUrl("http://localhost:18089/onlyget.bin");
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(onlyGetAllowedUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withStatus(405))); // Method Not Allowed
|
||||
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(onlyGetAllowedUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withStatus(200)));
|
||||
|
||||
wireMockServer.start();
|
||||
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void tearDownAll() {
|
||||
wireMockServer.stop();
|
||||
}
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() {
|
||||
fetcher = new HttpFetcherImpl(new UserAgent("test.marginalia.nu", "test.marginalia.nu"));
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
public void tearDown() throws IOException {
|
||||
var stats = fetcher.getPoolStats();
|
||||
assertEquals(0, stats.getLeased());
|
||||
assertEquals(0, stats.getPending());
|
||||
|
||||
fetcher.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProbeContentTypeHtmlShortcircuitPath() throws URISyntaxException {
|
||||
var result = fetcher.probeContentType(new EdgeUrl("https://localhost/test.html"), new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
|
||||
Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testProbeContentTypeHtmlShortcircuitTags() {
|
||||
var result = fetcher.probeContentType(contentTypeBinaryUrl, new DomainCookies(), new CrawlDelayTimer(50), new ContentTags("a", "b"));
|
||||
Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.NoOp.class, result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProbeContentTypeHtml() {
|
||||
var result = fetcher.probeContentType(contentTypeHtmlUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
|
||||
Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Ok(contentTypeHtmlUrl), result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProbeContentTypeBinary() {
|
||||
var result = fetcher.probeContentType(contentTypeBinaryUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
|
||||
Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.BadContentType("application/octet-stream", 200), result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProbeContentTypeRedirect() {
|
||||
var result = fetcher.probeContentType(redirectUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
|
||||
Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Redirect(contentTypeHtmlUrl), result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProbeContentTypeBadHttpStatus() {
|
||||
var result = fetcher.probeContentType(badHttpStatusUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
|
||||
Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.HttpError(500, "Bad status code"), result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOnlyGetAllowed() {
|
||||
var result = fetcher.probeContentType(onlyGetAllowedUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
|
||||
Assertions.assertEquals(new HttpFetcher.ContentTypeProbeResult.Ok(onlyGetAllowedUrl), result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTimeout() {
|
||||
var result = fetcher.probeContentType(timeoutUrl, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
|
||||
Assertions.assertInstanceOf(HttpFetcher.ContentTypeProbeResult.Timeout.class, result);
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,95 @@
|
||||
package nu.marginalia.crawl.fetcher;
|
||||
|
||||
import com.github.tomakehurst.wiremock.WireMockServer;
|
||||
import com.github.tomakehurst.wiremock.client.WireMock;
|
||||
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
|
||||
import nu.marginalia.UserAgent;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
|
||||
import org.junit.jupiter.api.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
@Tag("slow")
|
||||
class HttpFetcherImplDomainProbeTest {
|
||||
|
||||
private HttpFetcherImpl fetcher;
|
||||
private static WireMockServer wireMockServer;
|
||||
|
||||
private static EdgeUrl timeoutUrl;
|
||||
|
||||
@BeforeAll
|
||||
public static void setupAll() throws URISyntaxException {
|
||||
wireMockServer =
|
||||
new WireMockServer(WireMockConfiguration.wireMockConfig()
|
||||
.port(18089));
|
||||
|
||||
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo("/timeout"))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withFixedDelay(15000))); // 10 seconds delay to simulate timeout
|
||||
|
||||
wireMockServer.start();
|
||||
timeoutUrl = new EdgeUrl("http://localhost:18089/timeout");
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void tearDownAll() {
|
||||
wireMockServer.stop();
|
||||
}
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() {
|
||||
fetcher = new HttpFetcherImpl(new UserAgent("test.marginalia.nu", "test.marginalia.nu"));
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
public void tearDown() throws IOException {
|
||||
var stats = fetcher.getPoolStats();
|
||||
assertEquals(0, stats.getLeased());
|
||||
assertEquals(0, stats.getPending());
|
||||
|
||||
fetcher.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProbeDomain() throws URISyntaxException {
|
||||
var result = fetcher.probeDomain(new EdgeUrl("https://www.marginalia.nu/"));
|
||||
Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Ok(new EdgeUrl("https://www.marginalia.nu/")), result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProbeDomainProtoUpgrade() throws URISyntaxException {
|
||||
var result = fetcher.probeDomain(new EdgeUrl("http://www.marginalia.nu/"));
|
||||
Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Ok(new EdgeUrl("https://www.marginalia.nu/")), result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProbeDomainRedirect() throws URISyntaxException {
|
||||
var result = fetcher.probeDomain(new EdgeUrl("http://search.marginalia.nu/"));
|
||||
Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Redirect(new EdgeDomain("marginalia-search.com")), result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProbeDomainOnlyGET() throws URISyntaxException {
|
||||
// This test is to check if the domain probe only allows GET requests
|
||||
var result = fetcher.probeDomain(new EdgeUrl("https://marginalia-search.com/"));
|
||||
Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Ok(new EdgeUrl("https://marginalia-search.com/")), result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProbeDomainError() throws URISyntaxException {
|
||||
var result = fetcher.probeDomain(new EdgeUrl("https://invalid.example.com/"));
|
||||
Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Error during domain probe"), result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProbeDomainTimeout() throws URISyntaxException {
|
||||
var result = fetcher.probeDomain(timeoutUrl);
|
||||
Assertions.assertEquals(new HttpFetcher.DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Timeout during domain probe"), result);
|
||||
}
|
||||
}
|
@@ -0,0 +1,398 @@
|
||||
package nu.marginalia.crawl.fetcher;
|
||||
|
||||
import com.github.tomakehurst.wiremock.WireMockServer;
|
||||
import com.github.tomakehurst.wiremock.client.WireMock;
|
||||
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
|
||||
import nu.marginalia.UserAgent;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.body.HttpFetchResult;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.netpreserve.jwarc.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.time.Instant;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
@Tag("slow")
|
||||
class HttpFetcherImplFetchTest {
|
||||
|
||||
private HttpFetcherImpl fetcher;
|
||||
private static WireMockServer wireMockServer;
|
||||
|
||||
private static String etag = "etag";
|
||||
private static String lastModified = "Wed, 21 Oct 2024 07:28:00 GMT";
|
||||
|
||||
private static EdgeUrl okUrl;
|
||||
private static EdgeUrl okUrlSetsCookie;
|
||||
private static EdgeUrl okRangeResponseUrl;
|
||||
private static EdgeUrl okUrlWith304;
|
||||
|
||||
private static EdgeUrl timeoutUrl;
|
||||
private static EdgeUrl redirectUrl;
|
||||
private static EdgeUrl badHttpStatusUrl;
|
||||
private static EdgeUrl keepAliveUrl;
|
||||
|
||||
private static EdgeUrl pdfUrl;
|
||||
|
||||
@BeforeAll
|
||||
public static void setupAll() throws URISyntaxException {
|
||||
wireMockServer =
|
||||
new WireMockServer(WireMockConfiguration.wireMockConfig()
|
||||
.port(18089));
|
||||
|
||||
timeoutUrl = new EdgeUrl("http://localhost:18089/timeout.bin");
|
||||
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(timeoutUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withFixedDelay(15000)
|
||||
)); // 15 seconds delay to simulate timeout
|
||||
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(timeoutUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withFixedDelay(15000)
|
||||
.withBody("Hello World")
|
||||
)); // 15 seconds delay to simulate timeout
|
||||
|
||||
redirectUrl = new EdgeUrl("http://localhost:18089/redirect.bin");
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(redirectUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Location", "http://localhost:18089/test.html.bin")
|
||||
.withStatus(301)));
|
||||
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(redirectUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Location", "http://localhost:18089/test.html.bin")
|
||||
.withStatus(301)));
|
||||
|
||||
badHttpStatusUrl = new EdgeUrl("http://localhost:18089/badstatus");
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(badHttpStatusUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withStatus(500)));
|
||||
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(badHttpStatusUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withStatus(500)));
|
||||
|
||||
okUrl = new EdgeUrl("http://localhost:18089/ok.bin");
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(okUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withStatus(200)));
|
||||
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(okUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withStatus(200)
|
||||
.withBody("Hello World")));
|
||||
|
||||
okUrlSetsCookie = new EdgeUrl("http://localhost:18089/okSetCookie.bin");
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(okUrlSetsCookie.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withHeader("Set-Cookie", "test=1")
|
||||
.withStatus(200)));
|
||||
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(okUrlSetsCookie.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withHeader("Set-Cookie", "test=1")
|
||||
.withStatus(200)
|
||||
.withBody("Hello World")));
|
||||
|
||||
okUrlWith304 = new EdgeUrl("http://localhost:18089/ok304.bin");
|
||||
wireMockServer.stubFor(WireMock.head(WireMock.urlEqualTo(okUrlWith304.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withHeader("ETag", etag)
|
||||
.withHeader("Last-Modified", lastModified)
|
||||
.withStatus(304)));
|
||||
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(okUrlWith304.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withHeader("ETag", etag)
|
||||
.withHeader("Last-Modified", lastModified)
|
||||
.withStatus(304)));
|
||||
|
||||
okRangeResponseUrl = new EdgeUrl("http://localhost:18089/okRangeResponse.bin");
|
||||
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(okRangeResponseUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Range", "bytes 0-100/200")
|
||||
.withBody("Hello World")
|
||||
.withStatus(206)));
|
||||
|
||||
keepAliveUrl = new EdgeUrl("http://localhost:18089/keepalive.bin");
|
||||
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(keepAliveUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "text/html")
|
||||
.withStatus(200)
|
||||
.withHeader("Keep-Alive", "max=4, timeout=30")
|
||||
.withBody("Hello")
|
||||
));
|
||||
|
||||
|
||||
pdfUrl = new EdgeUrl("http://localhost:18089/test.pdf");
|
||||
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(pdfUrl.path))
|
||||
.willReturn(WireMock.aResponse()
|
||||
.withHeader("Content-Type", "application/pdf")
|
||||
.withStatus(200)
|
||||
.withBody("Hello World")));
|
||||
|
||||
wireMockServer.start();
|
||||
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void tearDownAll() {
|
||||
wireMockServer.stop();
|
||||
}
|
||||
|
||||
|
||||
WarcRecorder warcRecorder;
|
||||
Path warcFile;
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws IOException {
|
||||
fetcher = new HttpFetcherImpl(new UserAgent("test.marginalia.nu", "test.marginalia.nu"));
|
||||
warcFile = Files.createTempFile(getClass().getSimpleName(), ".warc");
|
||||
warcRecorder = new WarcRecorder(warcFile);
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
public void tearDown() throws IOException {
|
||||
var stats = fetcher.getPoolStats();
|
||||
assertEquals(0, stats.getLeased());
|
||||
assertEquals(0, stats.getPending());
|
||||
|
||||
System.out.println(stats);
|
||||
|
||||
fetcher.close();
|
||||
warcRecorder.close();
|
||||
Files.deleteIfExists(warcFile);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testFoo() {
|
||||
fetcher.fetchSitemapUrls("https://www.marginalia.nu/sitemap.xml", new CrawlDelayTimer(100));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOk_NoProbe() throws IOException {
|
||||
var result = fetcher.fetchContent(okUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
|
||||
Assertions.assertTrue(result.isOk());
|
||||
|
||||
List<WarcRecord> warcRecords = getWarcRecords();
|
||||
assertEquals(2, warcRecords.size());
|
||||
Assertions.assertInstanceOf(WarcRequest.class, warcRecords.get(0));
|
||||
Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));
|
||||
|
||||
WarcResponse response = (WarcResponse) warcRecords.get(1);
|
||||
assertEquals("0", response.http().headers().first("X-Has-Cookies").orElse("0"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOkSetsCookie() throws IOException {
|
||||
var cookies = new DomainCookies();
|
||||
var result = fetcher.fetchContent(okUrlSetsCookie, warcRecorder, cookies, new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
|
||||
Assertions.assertTrue(result.isOk());
|
||||
|
||||
List<WarcRecord> warcRecords = getWarcRecords();
|
||||
assertEquals(2, warcRecords.size());
|
||||
Assertions.assertInstanceOf(WarcRequest.class, warcRecords.get(0));
|
||||
Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));
|
||||
|
||||
WarcResponse response = (WarcResponse) warcRecords.get(1);
|
||||
assertEquals("1", response.http().headers().first("X-Has-Cookies").orElse("0"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOk_FullProbe() {
|
||||
var result = fetcher.fetchContent(okUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
|
||||
Assertions.assertTrue(result.isOk());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOk304_NoProbe() {
|
||||
var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.DISABLED);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.Result304Raw.class, result);
|
||||
System.out.println(result);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOk304_FullProbe() {
|
||||
var result = fetcher.fetchContent(okUrlWith304, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), new ContentTags(etag, lastModified), HttpFetcher.ProbeType.FULL);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.Result304Raw.class, result);
|
||||
System.out.println(result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBadStatus_NoProbe() throws IOException {
|
||||
var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
|
||||
Assertions.assertFalse(result.isOk());
|
||||
|
||||
|
||||
List<WarcRecord> warcRecords = getWarcRecords();
|
||||
assertEquals(2, warcRecords.size());
|
||||
Assertions.assertInstanceOf(WarcRequest.class, warcRecords.get(0));
|
||||
Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBadStatus_FullProbe() {
|
||||
var result = fetcher.fetchContent(badHttpStatusUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
|
||||
Assertions.assertFalse(result.isOk());
|
||||
|
||||
System.out.println(result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRedirect_NoProbe() throws URISyntaxException, IOException {
|
||||
var result = fetcher.fetchContent(redirectUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultRedirect.class, result);
|
||||
assertEquals(new EdgeUrl("http://localhost:18089/test.html.bin"), ((HttpFetchResult.ResultRedirect) result).url());
|
||||
|
||||
List<WarcRecord> warcRecords = getWarcRecords();
|
||||
assertEquals(2, warcRecords.size());
|
||||
Assertions.assertInstanceOf(WarcRequest.class, warcRecords.get(0));
|
||||
Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRedirect_FullProbe() throws URISyntaxException {
|
||||
var result = fetcher.fetchContent(redirectUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultRedirect.class, result);
|
||||
assertEquals(new EdgeUrl("http://localhost:18089/test.html.bin"), ((HttpFetchResult.ResultRedirect) result).url());
|
||||
|
||||
System.out.println(result);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testFetchTimeout_NoProbe() throws IOException, URISyntaxException {
|
||||
Instant requestStart = Instant.now();
|
||||
|
||||
var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultException.class, result);
|
||||
|
||||
Instant requestEnd = Instant.now();
|
||||
|
||||
System.out.println(result);
|
||||
|
||||
// Verify that we are actually timing out, and not blocking on the request until it finishes (which would be a bug),
|
||||
// the request will take 15 seconds to complete, so we should be able to timeout before that, something like 10 seconds and change;
|
||||
// but we'll verify that it is less than 15 seconds to make the test less fragile.
|
||||
|
||||
Assertions.assertTrue(requestEnd.isBefore(requestStart.plusSeconds(15)), "Request should have taken less than 15 seconds");
|
||||
|
||||
var records = getWarcRecords();
|
||||
Assertions.assertEquals(1, records.size());
|
||||
Assertions.assertInstanceOf(WarcXEntityRefused.class, records.getFirst());
|
||||
WarcXEntityRefused entity = (WarcXEntityRefused) records.getFirst();
|
||||
assertEquals(WarcXEntityRefused.documentProbeTimeout, entity.profile());
|
||||
assertEquals(timeoutUrl.asURI(), entity.targetURI());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRangeResponse() throws IOException {
|
||||
var result = fetcher.fetchContent(okRangeResponseUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
|
||||
Assertions.assertTrue(result.isOk());
|
||||
|
||||
List<WarcRecord> warcRecords = getWarcRecords();
|
||||
assertEquals(2, warcRecords.size());
|
||||
Assertions.assertInstanceOf(WarcRequest.class, warcRecords.get(0));
|
||||
Assertions.assertInstanceOf(WarcResponse.class, warcRecords.get(1));
|
||||
|
||||
var response = (WarcResponse) warcRecords.get(1);
|
||||
assertEquals("length", response.headers().first("WARC-Truncated").orElse(""));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFetchTimeout_Probe() throws IOException, URISyntaxException {
|
||||
Instant requestStart = Instant.now();
|
||||
var result = fetcher.fetchContent(timeoutUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
|
||||
Instant requestEnd = Instant.now();
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultException.class, result);
|
||||
|
||||
|
||||
// Verify that we are actually timing out, and not blocking on the request until it finishes (which would be a bug),
|
||||
// the request will take 15 seconds to complete, so we should be able to timeout before that, something like 10 seconds and change;
|
||||
// but we'll verify that it is less than 15 seconds to make the test less fragile.
|
||||
|
||||
Assertions.assertTrue(requestEnd.isBefore(requestStart.plusSeconds(15)), "Request should have taken less than 15 seconds");
|
||||
|
||||
var records = getWarcRecords();
|
||||
Assertions.assertEquals(1, records.size());
|
||||
Assertions.assertInstanceOf(WarcXEntityRefused.class, records.getFirst());
|
||||
WarcXEntityRefused entity = (WarcXEntityRefused) records.getFirst();
|
||||
assertEquals(WarcXEntityRefused.documentProbeTimeout, entity.profile());
|
||||
assertEquals(timeoutUrl.asURI(), entity.targetURI());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testKeepaliveUrl() {
|
||||
// mostly for smoke testing and debugger utility
|
||||
var result = fetcher.fetchContent(keepAliveUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
|
||||
Assertions.assertTrue(result.isOk());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPdf() {
|
||||
var result = fetcher.fetchContent(pdfUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
|
||||
|
||||
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
|
||||
Assertions.assertTrue(result.isOk());
|
||||
}
|
||||
|
||||
|
||||
private List<WarcRecord> getWarcRecords() throws IOException {
|
||||
List<WarcRecord> records = new ArrayList<>();
|
||||
|
||||
System.out.println(Files.readString(warcFile));
|
||||
|
||||
try (var reader = new WarcReader(warcFile)) {
|
||||
WarcXResponseReference.register(reader);
|
||||
WarcXEntityRefused.register(reader);
|
||||
|
||||
for (var record : reader) {
|
||||
// Load the body, we need to do this before we close the reader to have access to the content.
|
||||
if (record instanceof WarcRequest req) {
|
||||
req.http();
|
||||
} else if (record instanceof WarcResponse rsp) {
|
||||
rsp.http();
|
||||
}
|
||||
|
||||
records.add(record);
|
||||
}
|
||||
}
|
||||
|
||||
return records;
|
||||
}
|
||||
|
||||
|
||||
}
|
@@ -1,9 +1,12 @@
|
||||
package nu.marginalia.crawl.retreival;
|
||||
|
||||
import nu.marginalia.crawl.fetcher.Cookies;
|
||||
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
@@ -13,8 +16,6 @@ import org.netpreserve.jwarc.WarcResponse;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
@@ -30,8 +31,7 @@ class CrawlerWarcResynchronizerTest {
|
||||
HttpClient httpClient;
|
||||
@BeforeEach
|
||||
public void setUp() throws Exception {
|
||||
httpClient = HttpClient.newBuilder()
|
||||
.build();
|
||||
httpClient = HttpClients.createDefault();
|
||||
|
||||
fileName = Files.createTempFile("test", ".warc.gz");
|
||||
outputFile = Files.createTempFile("test", ".warc.gz");
|
||||
@@ -45,7 +45,7 @@ class CrawlerWarcResynchronizerTest {
|
||||
|
||||
@Test
|
||||
void run() throws IOException, URISyntaxException {
|
||||
try (var oldRecorder = new WarcRecorder(fileName, new Cookies())) {
|
||||
try (var oldRecorder = new WarcRecorder(fileName)) {
|
||||
fetchUrl(oldRecorder, "https://www.marginalia.nu/");
|
||||
fetchUrl(oldRecorder, "https://www.marginalia.nu/log/");
|
||||
fetchUrl(oldRecorder, "https://www.marginalia.nu/feed/");
|
||||
@@ -55,7 +55,7 @@ class CrawlerWarcResynchronizerTest {
|
||||
|
||||
var crawlFrontier = new DomainCrawlFrontier(new EdgeDomain("www.marginalia.nu"), List.of(), 100);
|
||||
|
||||
try (var newRecorder = new WarcRecorder(outputFile, new Cookies())) {
|
||||
try (var newRecorder = new WarcRecorder(outputFile)) {
|
||||
new CrawlerWarcResynchronizer(crawlFrontier, newRecorder).run(fileName);
|
||||
}
|
||||
|
||||
@@ -78,11 +78,10 @@ class CrawlerWarcResynchronizerTest {
|
||||
}
|
||||
|
||||
void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
|
||||
var req = HttpRequest.newBuilder()
|
||||
.uri(new java.net.URI(url))
|
||||
.header("User-agent", "test.marginalia.nu")
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.GET().build();
|
||||
recorder.fetch(httpClient, req);
|
||||
HttpGet request = new HttpGet(url);
|
||||
request.addHeader("User-agent", "test.marginalia.nu");
|
||||
request.addHeader("Accept-Encoding", "gzip");
|
||||
|
||||
recorder.fetch(httpClient, new DomainCookies(), request);
|
||||
}
|
||||
}
|
@@ -2,10 +2,10 @@ package nu.marginalia.crawl.retreival.fetcher;
|
||||
|
||||
import com.sun.net.httpserver.HttpServer;
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.fetcher.Cookies;
|
||||
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
@@ -32,7 +32,6 @@ class ContentTypeProberTest {
|
||||
static EdgeUrl timeoutEndpoint;
|
||||
|
||||
static Path warcFile;
|
||||
static WarcRecorder recorder;
|
||||
|
||||
@BeforeEach
|
||||
void setUp() throws IOException {
|
||||
@@ -80,21 +79,17 @@ class ContentTypeProberTest {
|
||||
htmlRedirEndpoint = EdgeUrl.parse("http://localhost:" + port + "/redir.gz").get();
|
||||
|
||||
fetcher = new HttpFetcherImpl("test");
|
||||
recorder = new WarcRecorder(warcFile, new Cookies());
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
void tearDown() throws IOException {
|
||||
server.stop(0);
|
||||
fetcher.close();
|
||||
recorder.close();
|
||||
|
||||
Files.deleteIfExists(warcFile);
|
||||
}
|
||||
|
||||
@Test
|
||||
void probeContentTypeOk() throws Exception {
|
||||
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlEndpoint, recorder, ContentTags.empty());
|
||||
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
|
||||
|
||||
System.out.println(result);
|
||||
|
||||
@@ -103,16 +98,16 @@ class ContentTypeProberTest {
|
||||
|
||||
@Test
|
||||
void probeContentTypeRedir() throws Exception {
|
||||
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlRedirEndpoint, recorder, ContentTags.empty());
|
||||
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(htmlRedirEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
|
||||
|
||||
System.out.println(result);
|
||||
|
||||
assertEquals(result, new HttpFetcher.ContentTypeProbeResult.Ok(htmlEndpoint));
|
||||
assertEquals(result, new HttpFetcher.ContentTypeProbeResult.Redirect(htmlEndpoint));
|
||||
}
|
||||
|
||||
@Test
|
||||
void probeContentTypeBad() throws Exception {
|
||||
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(binaryEndpoint, recorder, ContentTags.empty());
|
||||
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(binaryEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
|
||||
|
||||
System.out.println(result);
|
||||
|
||||
@@ -121,7 +116,7 @@ class ContentTypeProberTest {
|
||||
|
||||
@Test
|
||||
void probeContentTypeTimeout() throws Exception {
|
||||
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(timeoutEndpoint, recorder, ContentTags.empty());
|
||||
HttpFetcher.ContentTypeProbeResult result = fetcher.probeContentType(timeoutEndpoint, new DomainCookies(), new CrawlDelayTimer(50), ContentTags.empty());
|
||||
|
||||
System.out.println(result);
|
||||
|
||||
|
@@ -1,8 +1,11 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher;
|
||||
|
||||
import com.sun.net.httpserver.HttpServer;
|
||||
import nu.marginalia.crawl.fetcher.Cookies;
|
||||
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.netpreserve.jwarc.WarcReader;
|
||||
import org.netpreserve.jwarc.WarcRequest;
|
||||
@@ -10,8 +13,6 @@ import org.netpreserve.jwarc.WarcResponse;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.InetSocketAddress;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.time.Duration;
|
||||
@@ -50,11 +51,19 @@ class WarcRecorderFakeServerTest {
|
||||
os.write("<html><body>hello</body></html>".getBytes());
|
||||
os.flush();
|
||||
try {
|
||||
TimeUnit.SECONDS.sleep(10);
|
||||
TimeUnit.SECONDS.sleep(2);
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
os.write(":D".getBytes());
|
||||
os.write(":".getBytes());
|
||||
os.flush();
|
||||
try {
|
||||
TimeUnit.SECONDS.sleep(2);
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
os.write("D".getBytes());
|
||||
os.flush();
|
||||
}
|
||||
exchange.close();
|
||||
@@ -75,30 +84,27 @@ class WarcRecorderFakeServerTest {
|
||||
HttpClient httpClient;
|
||||
@BeforeEach
|
||||
public void setUp() throws Exception {
|
||||
httpClient = HttpClient.newBuilder().build();
|
||||
httpClient = HttpClients.createDefault();
|
||||
|
||||
fileNameWarc = Files.createTempFile("test", ".warc");
|
||||
fileNameParquet = Files.createTempFile("test", ".parquet");
|
||||
|
||||
client = new WarcRecorder(fileNameWarc, new Cookies());
|
||||
client = new WarcRecorder(fileNameWarc);
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
public void tearDown() throws Exception {
|
||||
|
||||
client.close();
|
||||
Files.delete(fileNameWarc);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void fetchFast() throws Exception {
|
||||
client.fetch(httpClient,
|
||||
HttpRequest.newBuilder()
|
||||
.uri(new java.net.URI("http://localhost:14510/fast"))
|
||||
.timeout(Duration.ofSeconds(1))
|
||||
.header("User-agent", "test.marginalia.nu")
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.GET().build()
|
||||
);
|
||||
HttpGet request = new HttpGet("http://localhost:14510/fast");
|
||||
request.addHeader("User-agent", "test.marginalia.nu");
|
||||
request.addHeader("Accept-Encoding", "gzip");
|
||||
client.fetch(httpClient, new DomainCookies(), request);
|
||||
|
||||
Map<String, String> sampleData = new HashMap<>();
|
||||
try (var warcReader = new WarcReader(fileNameWarc)) {
|
||||
@@ -118,13 +124,15 @@ class WarcRecorderFakeServerTest {
|
||||
@Test
|
||||
public void fetchSlow() throws Exception {
|
||||
Instant start = Instant.now();
|
||||
|
||||
HttpGet request = new HttpGet("http://localhost:14510/slow");
|
||||
request.addHeader("User-agent", "test.marginalia.nu");
|
||||
request.addHeader("Accept-Encoding", "gzip");
|
||||
|
||||
client.fetch(httpClient,
|
||||
HttpRequest.newBuilder()
|
||||
.uri(new java.net.URI("http://localhost:14510/slow"))
|
||||
.timeout(Duration.ofSeconds(1))
|
||||
.header("User-agent", "test.marginalia.nu")
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.GET().build()
|
||||
new DomainCookies(),
|
||||
request,
|
||||
Duration.ofSeconds(1)
|
||||
);
|
||||
Instant end = Instant.now();
|
||||
|
||||
@@ -141,12 +149,14 @@ class WarcRecorderFakeServerTest {
|
||||
});
|
||||
}
|
||||
|
||||
System.out.println(
|
||||
Files.readString(fileNameWarc));
|
||||
System.out.println(sampleData);
|
||||
|
||||
// Timeout is set to 1 second, but the server will take 5 seconds to respond,
|
||||
// so we expect the request to take 1s and change before it times out.
|
||||
|
||||
Assertions.assertTrue(Duration.between(start, end).toMillis() < 2000);
|
||||
Assertions.assertTrue(Duration.between(start, end).toMillis() < 3000);
|
||||
}
|
||||
|
||||
}
|
@@ -2,11 +2,14 @@ package nu.marginalia.crawl.retreival.fetcher;
|
||||
|
||||
import nu.marginalia.UserAgent;
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.fetcher.Cookies;
|
||||
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.io.SerializableCrawlDataStream;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
|
||||
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
|
||||
import nu.marginalia.slop.SlopCrawlDataRecord;
|
||||
import org.apache.hc.client5.http.classic.HttpClient;
|
||||
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
||||
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
@@ -17,30 +20,29 @@ import org.netpreserve.jwarc.WarcXResponseReference;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
class WarcRecorderTest {
|
||||
Path fileNameWarc;
|
||||
Path fileNameParquet;
|
||||
Path fileNameSlop;
|
||||
WarcRecorder client;
|
||||
|
||||
HttpClient httpClient;
|
||||
@BeforeEach
|
||||
public void setUp() throws Exception {
|
||||
httpClient = HttpClient.newBuilder().build();
|
||||
httpClient = HttpClients.createDefault();
|
||||
|
||||
fileNameWarc = Files.createTempFile("test", ".warc");
|
||||
fileNameParquet = Files.createTempFile("test", ".parquet");
|
||||
fileNameSlop = Files.createTempFile("test", ".slop.zip");
|
||||
|
||||
client = new WarcRecorder(fileNameWarc, new Cookies());
|
||||
client = new WarcRecorder(fileNameWarc);
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
@@ -51,13 +53,12 @@ class WarcRecorderTest {
|
||||
|
||||
@Test
|
||||
void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
|
||||
client.fetch(httpClient,
|
||||
HttpRequest.newBuilder()
|
||||
.uri(new java.net.URI("https://www.marginalia.nu/"))
|
||||
.header("User-agent", "test.marginalia.nu")
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.GET().build()
|
||||
);
|
||||
|
||||
HttpGet request = new HttpGet("https://www.marginalia.nu/");
|
||||
request.addHeader("User-agent", "test.marginalia.nu");
|
||||
request.addHeader("Accept-Encoding", "gzip");
|
||||
|
||||
client.fetch(httpClient, new DomainCookies(), request);
|
||||
|
||||
Map<String, String> sampleData = new HashMap<>();
|
||||
try (var warcReader = new WarcReader(fileNameWarc)) {
|
||||
@@ -78,8 +79,9 @@ class WarcRecorderTest {
|
||||
@Test
|
||||
public void flagAsSkipped() throws IOException, URISyntaxException {
|
||||
|
||||
try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
|
||||
try (var recorder = new WarcRecorder(fileNameWarc)) {
|
||||
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
|
||||
new DomainCookies(),
|
||||
"text/html",
|
||||
200,
|
||||
"<?doctype html><html><body>test</body></html>".getBytes(),
|
||||
@@ -102,8 +104,9 @@ class WarcRecorderTest {
|
||||
@Test
|
||||
public void flagAsSkippedNullBody() throws IOException, URISyntaxException {
|
||||
|
||||
try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
|
||||
try (var recorder = new WarcRecorder(fileNameWarc)) {
|
||||
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
|
||||
new DomainCookies(),
|
||||
"text/html",
|
||||
200,
|
||||
null,
|
||||
@@ -114,8 +117,9 @@ class WarcRecorderTest {
|
||||
|
||||
@Test
|
||||
public void testSaveImport() throws URISyntaxException, IOException {
|
||||
try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
|
||||
try (var recorder = new WarcRecorder(fileNameWarc)) {
|
||||
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
|
||||
new DomainCookies(),
|
||||
"text/html",
|
||||
200,
|
||||
"<?doctype html><html><body>test</body></html>".getBytes(),
|
||||
@@ -138,35 +142,46 @@ class WarcRecorderTest {
|
||||
|
||||
@Test
|
||||
public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
|
||||
client.fetch(httpClient, HttpRequest.newBuilder()
|
||||
.uri(new java.net.URI("https://www.marginalia.nu/"))
|
||||
.header("User-agent", "test.marginalia.nu")
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.GET().build());
|
||||
HttpGet request1 = new HttpGet("https://www.marginalia.nu/");
|
||||
request1.addHeader("User-agent", "test.marginalia.nu");
|
||||
request1.addHeader("Accept-Encoding", "gzip");
|
||||
|
||||
client.fetch(httpClient, HttpRequest.newBuilder()
|
||||
.uri(new java.net.URI("https://www.marginalia.nu/log/"))
|
||||
.header("User-agent", "test.marginalia.nu")
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.GET().build());
|
||||
client.fetch(httpClient, new DomainCookies(), request1);
|
||||
|
||||
client.fetch(httpClient, HttpRequest.newBuilder()
|
||||
.uri(new java.net.URI("https://www.marginalia.nu/sanic.png"))
|
||||
.header("User-agent", "test.marginalia.nu")
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.GET().build());
|
||||
HttpGet request2 = new HttpGet("https://www.marginalia.nu/log/");
|
||||
request2.addHeader("User-agent", "test.marginalia.nu");
|
||||
request2.addHeader("Accept-Encoding", "gzip");
|
||||
|
||||
CrawledDocumentParquetRecordFileWriter.convertWarc(
|
||||
client.fetch(httpClient, new DomainCookies(), request2);
|
||||
|
||||
HttpGet request3 = new HttpGet("https://www.marginalia.nu/sanic.png");
|
||||
request3.addHeader("User-agent", "test.marginalia.nu");
|
||||
request3.addHeader("Accept-Encoding", "gzip");
|
||||
|
||||
client.fetch(httpClient, new DomainCookies(), request3);
|
||||
|
||||
HttpGet request4 = new HttpGet("https://downloads.marginalia.nu/test.pdf");
|
||||
request4.addHeader("User-agent", "test.marginalia.nu");
|
||||
request4.addHeader("Accept-Encoding", "gzip");
|
||||
|
||||
client.fetch(httpClient, new DomainCookies(), request4);
|
||||
|
||||
SlopCrawlDataRecord.convertWarc(
|
||||
"www.marginalia.nu",
|
||||
new UserAgent("test", "test"),
|
||||
fileNameWarc,
|
||||
fileNameParquet);
|
||||
fileNameSlop);
|
||||
|
||||
var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList();
|
||||
assertEquals(2, urls.size());
|
||||
List<String> urls;
|
||||
try (var stream = SerializableCrawlDataStream.openDataStream(fileNameSlop)) {
|
||||
urls = stream.docsAsList().stream().map(doc -> doc.url.toString()).toList();
|
||||
}
|
||||
|
||||
assertEquals(3, urls.size());
|
||||
assertEquals("https://www.marginalia.nu/", urls.get(0));
|
||||
assertEquals("https://www.marginalia.nu/log/", urls.get(1));
|
||||
// sanic.png gets filtered out for its bad mime type
|
||||
assertEquals("https://downloads.marginalia.nu/test.pdf", urls.get(2));
|
||||
|
||||
}
|
||||
|
||||
|
@@ -1,6 +1,7 @@
|
||||
package nu.marginalia.crawling;
|
||||
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
@@ -31,7 +32,7 @@ class HttpFetcherTest {
|
||||
void fetchUTF8() throws Exception {
|
||||
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
|
||||
try (var recorder = new WarcRecorder()) {
|
||||
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
|
||||
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, new DomainCookies(), new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
|
||||
if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
|
||||
System.out.println(bodyOk.contentType());
|
||||
}
|
||||
@@ -49,7 +50,7 @@ class HttpFetcherTest {
|
||||
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
|
||||
|
||||
try (var recorder = new WarcRecorder()) {
|
||||
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
|
||||
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, new DomainCookies(), new CrawlDelayTimer(100), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
|
||||
if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
|
||||
System.out.println(bodyOk.contentType());
|
||||
}
|
||||
|
@@ -15,6 +15,9 @@ import nu.marginalia.model.crawldata.CrawledDocument;
|
||||
import nu.marginalia.model.crawldata.CrawlerDocumentStatus;
|
||||
import nu.marginalia.model.crawldata.SerializableCrawlData;
|
||||
import nu.marginalia.test.CommonTestData;
|
||||
import org.apache.hc.client5.http.cookie.BasicCookieStore;
|
||||
import org.apache.hc.client5.http.cookie.CookieStore;
|
||||
import org.apache.hc.core5.http.Header;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
@@ -24,7 +27,6 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.http.HttpHeaders;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
@@ -120,7 +122,7 @@ public class CrawlerMockFetcherTest {
|
||||
public void setAllowAllContentTypes(boolean allowAllContentTypes) {}
|
||||
|
||||
@Override
|
||||
public Cookies getCookies() { return new Cookies();}
|
||||
public CookieStore getCookies() { return new BasicCookieStore();}
|
||||
|
||||
@Override
|
||||
public void clearCookies() {}
|
||||
@@ -132,13 +134,7 @@ public class CrawlerMockFetcherTest {
|
||||
}
|
||||
|
||||
@Override
|
||||
public ContentTypeProbeResult probeContentType(EdgeUrl url, WarcRecorder recorder, ContentTags tags) {
|
||||
logger.info("Probing {}", url);
|
||||
return new HttpFetcher.ContentTypeProbeResult.Ok(url);
|
||||
}
|
||||
|
||||
@Override
|
||||
public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags, ProbeType probeType) {
|
||||
public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, DomainCookies cookies, CrawlDelayTimer timer, ContentTags tags, ProbeType probeType) {
|
||||
logger.info("Fetching {}", url);
|
||||
if (mockData.containsKey(url)) {
|
||||
byte[] bodyBytes = mockData.get(url).documentBodyBytes;
|
||||
@@ -147,7 +143,7 @@ public class CrawlerMockFetcherTest {
|
||||
return new HttpFetchResult.ResultOk(
|
||||
url.asURI(),
|
||||
200,
|
||||
HttpHeaders.of(Map.of(), (k,v)->true),
|
||||
new Header[0],
|
||||
"127.0.0.1",
|
||||
bodyBytes,
|
||||
0,
|
||||
|
@@ -5,7 +5,6 @@ import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.crawl.CrawlerMain;
|
||||
import nu.marginalia.crawl.DomainStateDb;
|
||||
import nu.marginalia.crawl.fetcher.Cookies;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
@@ -16,7 +15,7 @@ import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||
import nu.marginalia.model.crawldata.CrawledDomain;
|
||||
import nu.marginalia.model.crawldata.SerializableCrawlData;
|
||||
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
|
||||
import nu.marginalia.slop.SlopCrawlDataRecord;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.netpreserve.jwarc.*;
|
||||
@@ -37,16 +36,16 @@ class CrawlerRetreiverTest {
|
||||
private HttpFetcher httpFetcher;
|
||||
|
||||
Path tempFileWarc1;
|
||||
Path tempFileParquet1;
|
||||
Path tempFileSlop1;
|
||||
Path tempFileWarc2;
|
||||
Path tempFileParquet2;
|
||||
Path tempFileSlop2;
|
||||
Path tempFileWarc3;
|
||||
Path tempFileDb;
|
||||
@BeforeEach
|
||||
public void setUp() throws IOException {
|
||||
httpFetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
|
||||
tempFileParquet1 = Files.createTempFile("crawling-process", ".parquet");
|
||||
tempFileParquet2 = Files.createTempFile("crawling-process", ".parquet");
|
||||
tempFileSlop1 = Files.createTempFile("crawling-process", ".slop.zip");
|
||||
tempFileSlop2 = Files.createTempFile("crawling-process", ".slop.zip");
|
||||
tempFileDb = Files.createTempFile("crawling-process", ".db");
|
||||
|
||||
}
|
||||
@@ -62,14 +61,14 @@ class CrawlerRetreiverTest {
|
||||
if (tempFileWarc1 != null) {
|
||||
Files.deleteIfExists(tempFileWarc1);
|
||||
}
|
||||
if (tempFileParquet1 != null) {
|
||||
Files.deleteIfExists(tempFileParquet1);
|
||||
if (tempFileSlop1 != null) {
|
||||
Files.deleteIfExists(tempFileSlop1);
|
||||
}
|
||||
if (tempFileWarc2 != null) {
|
||||
Files.deleteIfExists(tempFileWarc2);
|
||||
}
|
||||
if (tempFileParquet2 != null) {
|
||||
Files.deleteIfExists(tempFileParquet2);
|
||||
if (tempFileSlop2 != null) {
|
||||
Files.deleteIfExists(tempFileSlop2);
|
||||
}
|
||||
if (tempFileWarc3 != null) {
|
||||
Files.deleteIfExists(tempFileWarc3);
|
||||
@@ -180,7 +179,7 @@ class CrawlerRetreiverTest {
|
||||
new EdgeDomain("www.marginalia.nu"),
|
||||
List.of(), 100);
|
||||
var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
|
||||
new WarcRecorder(tempFileWarc2, new Cookies())
|
||||
new WarcRecorder(tempFileWarc2)
|
||||
);
|
||||
|
||||
// truncate the size of the file to simulate a crash
|
||||
@@ -224,9 +223,9 @@ class CrawlerRetreiverTest {
|
||||
|
||||
doCrawl(tempFileWarc1, specs);
|
||||
|
||||
convertToParquet(tempFileWarc1, tempFileParquet1);
|
||||
convertToSlop(tempFileWarc1, tempFileSlop1);
|
||||
|
||||
try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
|
||||
try (var stream = SerializableCrawlDataStream.openDataStream(tempFileSlop1)) {
|
||||
while (stream.hasNext()) {
|
||||
if (stream.next() instanceof CrawledDocument doc) {
|
||||
data.add(doc);
|
||||
@@ -277,9 +276,9 @@ class CrawlerRetreiverTest {
|
||||
assertFalse(frontier.isVisited(new EdgeUrl("https://www.marginalia.nu/log/06-optimization/")));
|
||||
assertTrue(frontier.isKnown(new EdgeUrl("https://www.marginalia.nu/log/06-optimization/")));
|
||||
|
||||
convertToParquet(tempFileWarc1, tempFileParquet1);
|
||||
convertToSlop(tempFileWarc1, tempFileSlop1);
|
||||
|
||||
try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
|
||||
try (var stream = SerializableCrawlDataStream.openDataStream(tempFileSlop1)) {
|
||||
while (stream.hasNext()) {
|
||||
if (stream.next() instanceof CrawledDocument doc) {
|
||||
data.add(doc);
|
||||
@@ -293,7 +292,7 @@ class CrawlerRetreiverTest {
|
||||
// redirects to https://www.marginalia.nu/log/06-optimization.gmi/ (note the trailing slash)
|
||||
//
|
||||
// Ensure that the redirect is followed, and that the trailing slash is added
|
||||
// to the url as reported in the parquet file.
|
||||
// to the url as reported in the Slop file.
|
||||
|
||||
var fetchedUrls =
|
||||
data.stream()
|
||||
@@ -326,9 +325,9 @@ class CrawlerRetreiverTest {
|
||||
tempFileWarc1 = Files.createTempFile("crawling-process", ".warc");
|
||||
|
||||
doCrawl(tempFileWarc1, specs);
|
||||
convertToParquet(tempFileWarc1, tempFileParquet1);
|
||||
convertToSlop(tempFileWarc1, tempFileSlop1);
|
||||
|
||||
try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
|
||||
try (var stream = SerializableCrawlDataStream.openDataStream(tempFileSlop1)) {
|
||||
while (stream.hasNext()) {
|
||||
if (stream.next() instanceof CrawledDocument doc) {
|
||||
data.add(doc);
|
||||
@@ -373,11 +372,11 @@ class CrawlerRetreiverTest {
|
||||
tempFileWarc2 = Files.createTempFile("crawling-process", ".warc.gz");
|
||||
|
||||
doCrawl(tempFileWarc1, specs);
|
||||
convertToParquet(tempFileWarc1, tempFileParquet1);
|
||||
convertToSlop(tempFileWarc1, tempFileSlop1);
|
||||
doCrawlWithReferenceStream(specs,
|
||||
new CrawlDataReference(tempFileParquet1)
|
||||
new CrawlDataReference(tempFileSlop1)
|
||||
);
|
||||
convertToParquet(tempFileWarc2, tempFileParquet2);
|
||||
convertToSlop(tempFileWarc2, tempFileSlop2);
|
||||
|
||||
try (var reader = new WarcReader(tempFileWarc2)) {
|
||||
WarcXResponseReference.register(reader);
|
||||
@@ -396,7 +395,7 @@ class CrawlerRetreiverTest {
|
||||
});
|
||||
}
|
||||
|
||||
try (var ds = SerializableCrawlDataStream.openDataStream(tempFileParquet2)) {
|
||||
try (var ds = SerializableCrawlDataStream.openDataStream(tempFileSlop2)) {
|
||||
while (ds.hasNext()) {
|
||||
var doc = ds.next();
|
||||
if (doc instanceof CrawledDomain dr) {
|
||||
@@ -411,9 +410,9 @@ class CrawlerRetreiverTest {
|
||||
}
|
||||
}
|
||||
|
||||
private void convertToParquet(Path tempFileWarc2, Path tempFileParquet2) {
|
||||
CrawledDocumentParquetRecordFileWriter.convertWarc("www.marginalia.nu",
|
||||
new UserAgent("test", "test"), tempFileWarc2, tempFileParquet2);
|
||||
private void convertToSlop(Path tempFileWarc2, Path tempFileSlop2) throws IOException {
|
||||
SlopCrawlDataRecord.convertWarc("www.marginalia.nu",
|
||||
new UserAgent("test", "test"), tempFileWarc2, tempFileSlop2);
|
||||
}
|
||||
|
||||
|
||||
@@ -436,9 +435,9 @@ class CrawlerRetreiverTest {
|
||||
|
||||
doCrawl(tempFileWarc1, specs);
|
||||
|
||||
convertToParquet(tempFileWarc1, tempFileParquet1);
|
||||
convertToSlop(tempFileWarc1, tempFileSlop1);
|
||||
|
||||
try (var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1)) {
|
||||
try (var stream = SerializableCrawlDataStream.openDataStream(tempFileSlop1)) {
|
||||
while (stream.hasNext()) {
|
||||
var doc = stream.next();
|
||||
data.computeIfAbsent(doc.getClass(), c -> new ArrayList<>()).add(doc);
|
||||
@@ -449,14 +448,14 @@ class CrawlerRetreiverTest {
|
||||
|
||||
System.out.println("---");
|
||||
|
||||
doCrawlWithReferenceStream(specs, new CrawlDataReference(tempFileParquet1));
|
||||
doCrawlWithReferenceStream(specs, new CrawlDataReference(tempFileSlop1));
|
||||
|
||||
var revisitCrawlFrontier = new DomainCrawlFrontier(
|
||||
new EdgeDomain("www.marginalia.nu"),
|
||||
List.of(), 100);
|
||||
|
||||
var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
|
||||
new WarcRecorder(tempFileWarc3, new Cookies())
|
||||
new WarcRecorder(tempFileWarc3)
|
||||
);
|
||||
|
||||
// truncate the size of the file to simulate a crash
|
||||
@@ -465,7 +464,7 @@ class CrawlerRetreiverTest {
|
||||
resync.run(tempFileWarc2);
|
||||
|
||||
assertTrue(revisitCrawlFrontier.addKnown(new EdgeUrl("https://www.marginalia.nu/")));
|
||||
convertToParquet(tempFileWarc3, tempFileParquet2);
|
||||
convertToSlop(tempFileWarc3, tempFileSlop2);
|
||||
|
||||
|
||||
try (var reader = new WarcReader(tempFileWarc3)) {
|
||||
@@ -485,7 +484,7 @@ class CrawlerRetreiverTest {
|
||||
});
|
||||
}
|
||||
|
||||
try (var ds = SerializableCrawlDataStream.openDataStream(tempFileParquet2)) {
|
||||
try (var ds = SerializableCrawlDataStream.openDataStream(tempFileSlop2)) {
|
||||
while (ds.hasNext()) {
|
||||
var doc = ds.next();
|
||||
if (doc instanceof CrawledDomain dr) {
|
||||
@@ -507,7 +506,7 @@ class CrawlerRetreiverTest {
|
||||
}
|
||||
|
||||
private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, CrawlDataReference reference) {
|
||||
try (var recorder = new WarcRecorder(tempFileWarc2, new Cookies());
|
||||
try (var recorder = new WarcRecorder(tempFileWarc2);
|
||||
var db = new DomainStateDb(tempFileDb)
|
||||
) {
|
||||
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(), reference);
|
||||
@@ -519,7 +518,7 @@ class CrawlerRetreiverTest {
|
||||
|
||||
@NotNull
|
||||
private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlerMain.CrawlSpecRecord specs) {
|
||||
try (var recorder = new WarcRecorder(tempFileWarc1, new Cookies());
|
||||
try (var recorder = new WarcRecorder(tempFileWarc1);
|
||||
var db = new DomainStateDb(tempFileDb)
|
||||
) {
|
||||
var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder);
|
||||
|
@@ -1,9 +1,16 @@
|
||||
This is a bit of a hack!
|
||||
|
||||
This class exists to let tailwind know we're using these classes even though they aren't visible in the code,
|
||||
as we sometimes generate classes from Java code!
|
||||
as we sometimes generate classes from Java code or javascript!
|
||||
|
||||
<i class="text-blue-800 bg-blue-50 dark:text-blue-200 dark:bg-blue-950"></i>
|
||||
<i class="text-green-800 bg-green-50 dark:text-green-200 dark:bg-green-950"></i>
|
||||
<i class="text-purple-800 bg-purple-50 dark:text-purple-200 dark:bg-purple-950"></i>
|
||||
<i class="text-blue-950 bg-gray-100 dark:text-blue-50 dark:bg-gray-900"></i>
|
||||
<span class="hover:bg-gray-300 "></span>
|
||||
|
||||
<label class="suggestion group block relative">
|
||||
<input type="radio" name="suggestion" class="peer hidden" checked>
|
||||
<div class="px-4 py-2 cursor-pointer dark:peer-checked:bg-gray-700 dark:hover:bg-gray-700 peer-checked:bg-gray-300 hover:bg-gray-300 w-full">
|
||||
</div>
|
||||
</label>
|
@@ -13,7 +13,7 @@
|
||||
class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"
|
||||
value="${query}"
|
||||
autofocus
|
||||
placeholder="Search..."
|
||||
placeholder="Search the web!"
|
||||
autocomplete="off"
|
||||
name="query"
|
||||
id="searchInput" />
|
||||
@@ -21,13 +21,13 @@
|
||||
<input type="text"
|
||||
class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"
|
||||
value="${query}"
|
||||
placeholder="Search..."
|
||||
placeholder="Search the web!"
|
||||
autocomplete="off"
|
||||
name="query"
|
||||
id="searchInput" />
|
||||
@endif
|
||||
|
||||
<div id="searchSuggestions" class="text-sm absolute top-2 mt-10 w-96 bg-white dark:bg-black border dark:border-gray-600 border-gray-200 rounded-lg shadow-lg hidden"></div>
|
||||
<div aria-hidden="true" id="searchSuggestions" class="text-sm absolute top-3 mt-10 w-96 bg-white dark:bg-black border dark:border-gray-600 border-gray-300 rounded-lg shadow-lg hidden"></div>
|
||||
|
||||
<button class="px-4 py-2 bg-margeblue text-white ml-2 rounded whitespace-nowrap active:text-slate-200">
|
||||
<i class="fas fa-search text-sm sm:mr-3"></i>
|
||||
|
@@ -43,13 +43,13 @@ function displaySuggestions(suggestions) {
|
||||
}
|
||||
|
||||
suggestionsContainer.innerHTML = suggestions.map((suggestion, index) => `
|
||||
<div
|
||||
class="suggestion px-4 py-2 cursor-pointer hover:bg-gray-100 ${index === selectedIndex ? 'bg-blue-50' : ''}"
|
||||
data-index="${index}"
|
||||
>
|
||||
${suggestion}
|
||||
</div>
|
||||
`).join('');
|
||||
<label class="suggestion group block relative">
|
||||
<input type="radio" name="suggestion" class="peer hidden" ${index === selectedIndex ? 'checked' : ''}>
|
||||
<div class="px-4 py-2 cursor-pointer dark:peer-checked:bg-gray-700 dark:hover:bg-gray-700 peer-checked:bg-gray-300 hover:bg-gray-300 w-full" data-index="${index}">
|
||||
${suggestion}
|
||||
</div>
|
||||
</label>
|
||||
`).join('');
|
||||
|
||||
suggestionsContainer.classList.remove('hidden');
|
||||
|
||||
|
@@ -10,7 +10,7 @@ import static com.google.inject.name.Names.named;
|
||||
|
||||
public class AssistantModule extends AbstractModule {
|
||||
public void configure() {
|
||||
bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions.txt"));
|
||||
bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions2.txt.gz"));
|
||||
|
||||
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
|
||||
}
|
||||
|
@@ -0,0 +1,459 @@
|
||||
package nu.marginalia.assistant.suggest;
|
||||
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/** Unhinged data structure for fast prefix searching.
|
||||
*/
|
||||
public class PrefixSearchStructure {
|
||||
// Core data structures
|
||||
private final HashMap<String, TIntArrayList> prefixIndex; // Short prefix index (up to 8 chars)
|
||||
private final HashMap<String, TIntArrayList> longPrefixIndex; // Long prefix index (9-16 chars)
|
||||
private final ArrayList<String> words; // All words by ID
|
||||
private final TIntArrayList wordScores; // Scores for all words
|
||||
|
||||
// Configuration
|
||||
private static final int SHORT_PREFIX_LENGTH = 8;
|
||||
private static final int MAX_INDEXED_PREFIX_LENGTH = 16;
|
||||
|
||||
public int size() {
|
||||
return words.size();
|
||||
}
|
||||
|
||||
// For sorting efficiency
|
||||
private static class WordScorePair {
|
||||
final String word;
|
||||
final int score;
|
||||
|
||||
WordScorePair(String word, int score) {
|
||||
this.word = word;
|
||||
this.score = score;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new PrefixTrie for typeahead search.
|
||||
*/
|
||||
public PrefixSearchStructure() {
|
||||
prefixIndex = new HashMap<>(1024);
|
||||
longPrefixIndex = new HashMap<>(1024);
|
||||
words = new ArrayList<>(1024);
|
||||
wordScores = new TIntArrayList(1024);
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a prefix to the index.
|
||||
*/
|
||||
private void indexPrefix(String word, int wordId) {
|
||||
// Index short prefixes
|
||||
for (int i = 1; i <= Math.min(word.length(), SHORT_PREFIX_LENGTH); i++) {
|
||||
String prefix = word.substring(0, i);
|
||||
TIntArrayList wordIds = prefixIndex.computeIfAbsent(
|
||||
prefix, k -> new TIntArrayList(16));
|
||||
wordIds.add(wordId);
|
||||
}
|
||||
|
||||
// Index longer prefixes
|
||||
for (int i = SHORT_PREFIX_LENGTH + 1; i <= Math.min(word.length(), MAX_INDEXED_PREFIX_LENGTH); i++) {
|
||||
String prefix = word.substring(0, i);
|
||||
TIntArrayList wordIds = longPrefixIndex.computeIfAbsent(
|
||||
prefix, k -> new TIntArrayList(8));
|
||||
wordIds.add(wordId);
|
||||
}
|
||||
|
||||
// If the word contains spaces, also index by each term for multi-word queries
|
||||
if (word.contains(" ")) {
|
||||
String[] terms = word.split("\\s+");
|
||||
for (String term : terms) {
|
||||
if (term.length() >= 2) {
|
||||
for (int i = 1; i <= Math.min(term.length(), SHORT_PREFIX_LENGTH); i++) {
|
||||
String termPrefix = "t:" + term.substring(0, i);
|
||||
TIntArrayList wordIds = prefixIndex.computeIfAbsent(
|
||||
termPrefix, k -> new TIntArrayList(16));
|
||||
wordIds.add(wordId);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Inserts a word with its associated score.
|
||||
*/
|
||||
public void insert(String word, int score) {
|
||||
if (word == null || word.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Add to the word list and index
|
||||
int wordId = words.size();
|
||||
words.add(word);
|
||||
wordScores.add(score);
|
||||
indexPrefix(word, wordId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the top k completions for a given prefix.
|
||||
*/
|
||||
public List<ScoredSuggestion> getTopCompletions(String prefix, int k) {
|
||||
if (prefix == null || prefix.isEmpty()) {
|
||||
// Return top k words by score
|
||||
return getTopKWords(k);
|
||||
}
|
||||
|
||||
// Check if this is a term search (t:) - for searching within multi-word items
|
||||
boolean isTermSearch = false;
|
||||
if (prefix.startsWith("t:") && prefix.length() > 2) {
|
||||
isTermSearch = true;
|
||||
prefix = prefix.substring(2);
|
||||
}
|
||||
|
||||
// 1. Fast path for short prefixes
|
||||
if (prefix.length() <= SHORT_PREFIX_LENGTH) {
|
||||
String lookupPrefix = isTermSearch ? "t:" + prefix : prefix;
|
||||
TIntArrayList wordIds = prefixIndex.get(lookupPrefix);
|
||||
if (wordIds != null) {
|
||||
return getTopKFromWordIds(wordIds, k);
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Fast path for long prefixes (truncate to MAX_INDEXED_PREFIX_LENGTH)
|
||||
if (prefix.length() > SHORT_PREFIX_LENGTH) {
|
||||
// Try exact match in longPrefixIndex first
|
||||
if (prefix.length() <= MAX_INDEXED_PREFIX_LENGTH) {
|
||||
TIntArrayList wordIds = longPrefixIndex.get(prefix);
|
||||
if (wordIds != null) {
|
||||
return getTopKFromWordIds(wordIds, k);
|
||||
}
|
||||
}
|
||||
|
||||
// If prefix is longer than MAX_INDEXED_PREFIX_LENGTH, truncate and filter
|
||||
if (prefix.length() > MAX_INDEXED_PREFIX_LENGTH) {
|
||||
String truncatedPrefix = prefix.substring(0, MAX_INDEXED_PREFIX_LENGTH);
|
||||
TIntArrayList candidateIds = longPrefixIndex.get(truncatedPrefix);
|
||||
if (candidateIds != null) {
|
||||
// Filter candidates by the full prefix
|
||||
return getFilteredTopKFromWordIds(candidateIds, prefix, k);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Optimized fallback for long prefixes - use prefix tree for segments
|
||||
List<ScoredSuggestion> results = new ArrayList<>();
|
||||
|
||||
// Handle multi-segment queries by finding candidates from first 8 chars
|
||||
if (prefix.length() > SHORT_PREFIX_LENGTH) {
|
||||
String shortPrefix = prefix.substring(0, Math.min(prefix.length(), SHORT_PREFIX_LENGTH));
|
||||
TIntArrayList candidates = prefixIndex.get(shortPrefix);
|
||||
|
||||
if (candidates != null) {
|
||||
return getFilteredTopKFromWordIds(candidates, prefix, k);
|
||||
}
|
||||
}
|
||||
|
||||
// 4. Last resort - optimized binary search in sorted segments
|
||||
return findByBinarySearchPrefix(prefix, k);
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper to get the top k words by score.
|
||||
*/
|
||||
private List<ScoredSuggestion> getTopKWords(int k) {
|
||||
// Create pairs of (score, wordId)
|
||||
int[][] pairs = new int[words.size()][2];
|
||||
for (int i = 0; i < words.size(); i++) {
|
||||
pairs[i][0] = wordScores.get(i);
|
||||
pairs[i][1] = i;
|
||||
}
|
||||
|
||||
// Sort by score (descending)
|
||||
Arrays.sort(pairs, (a, b) -> Integer.compare(b[0], a[0]));
|
||||
|
||||
// Take top k
|
||||
List<ScoredSuggestion> results = new ArrayList<>();
|
||||
for (int i = 0; i < Math.min(k, pairs.length); i++) {
|
||||
String word = words.get(pairs[i][1]);
|
||||
int score = pairs[i][0];
|
||||
results.add(new ScoredSuggestion(word, score));
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper to get the top k words from a list of word IDs.
|
||||
*/
|
||||
private List<ScoredSuggestion> getTopKFromWordIds(TIntArrayList wordIds, int k) {
|
||||
if (wordIds == null || wordIds.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
// For small lists, avoid sorting
|
||||
if (wordIds.size() <= k) {
|
||||
List<ScoredSuggestion> results = new ArrayList<>(wordIds.size());
|
||||
int[] ids = wordIds.toArray();
|
||||
for (int wordId : ids) {
|
||||
if (wordId >= 0 && wordId < words.size()) {
|
||||
results.add(new ScoredSuggestion(words.get(wordId), wordScores.get(wordId)));
|
||||
}
|
||||
}
|
||||
results.sort((a, b) -> Integer.compare(b.getScore(), a.getScore()));
|
||||
return results;
|
||||
}
|
||||
|
||||
// For larger lists, use an array-based approach for better performance
|
||||
// Find top k without full sorting
|
||||
int[] topScores = new int[k];
|
||||
int[] topWordIds = new int[k];
|
||||
int[] ids = wordIds.toArray();
|
||||
|
||||
// Initialize with first k elements
|
||||
int filledCount = Math.min(k, ids.length);
|
||||
for (int i = 0; i < filledCount; i++) {
|
||||
int wordId = ids[i];
|
||||
if (wordId >= 0 && wordId < words.size()) {
|
||||
topWordIds[i] = wordId;
|
||||
topScores[i] = wordScores.get(wordId);
|
||||
}
|
||||
}
|
||||
|
||||
// Sort initial elements
|
||||
for (int i = 0; i < filledCount; i++) {
|
||||
for (int j = i + 1; j < filledCount; j++) {
|
||||
if (topScores[j] > topScores[i]) {
|
||||
// Swap scores
|
||||
int tempScore = topScores[i];
|
||||
topScores[i] = topScores[j];
|
||||
topScores[j] = tempScore;
|
||||
|
||||
// Swap word IDs
|
||||
int tempId = topWordIds[i];
|
||||
topWordIds[i] = topWordIds[j];
|
||||
topWordIds[j] = tempId;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Process remaining elements
|
||||
int minScore = filledCount > 0 ? topScores[filledCount - 1] : Integer.MIN_VALUE;
|
||||
|
||||
for (int i = k; i < ids.length; i++) {
|
||||
int wordId = ids[i];
|
||||
if (wordId >= 0 && wordId < words.size()) {
|
||||
int score = wordScores.get(wordId);
|
||||
|
||||
if (score > minScore) {
|
||||
// Replace the lowest element
|
||||
topScores[filledCount - 1] = score;
|
||||
topWordIds[filledCount - 1] = wordId;
|
||||
|
||||
// Bubble up the new element
|
||||
for (int j = filledCount - 1; j > 0; j--) {
|
||||
if (topScores[j] > topScores[j - 1]) {
|
||||
// Swap scores
|
||||
int tempScore = topScores[j];
|
||||
topScores[j] = topScores[j - 1];
|
||||
topScores[j - 1] = tempScore;
|
||||
|
||||
// Swap word IDs
|
||||
int tempId = topWordIds[j];
|
||||
topWordIds[j] = topWordIds[j - 1];
|
||||
topWordIds[j - 1] = tempId;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Update min score
|
||||
minScore = topScores[filledCount - 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create result list
|
||||
List<ScoredSuggestion> results = new ArrayList<>(filledCount);
|
||||
for (int i = 0; i < filledCount; i++) {
|
||||
results.add(new ScoredSuggestion(words.get(topWordIds[i]), topScores[i]));
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
* Use binary search on sorted word segments to efficiently find matches.
|
||||
*/
|
||||
private List<ScoredSuggestion> findByBinarySearchPrefix(String prefix, int k) {
|
||||
// If we have a lot of words, use an optimized segment approach
|
||||
if (words.size() > 1000) {
|
||||
// Divide words into segments for better locality
|
||||
int segmentSize = 1000;
|
||||
int numSegments = (words.size() + segmentSize - 1) / segmentSize;
|
||||
|
||||
// Find matches using binary search within each segment
|
||||
List<WordScorePair> allMatches = new ArrayList<>();
|
||||
for (int segment = 0; segment < numSegments; segment++) {
|
||||
int start = segment * segmentSize;
|
||||
int end = Math.min(start + segmentSize, words.size());
|
||||
|
||||
// Binary search for first potential match
|
||||
int pos = Collections.binarySearch(
|
||||
words.subList(start, end),
|
||||
prefix,
|
||||
(a, b) -> a.compareTo(b)
|
||||
);
|
||||
|
||||
if (pos < 0) {
|
||||
pos = -pos - 1;
|
||||
}
|
||||
|
||||
// Collect all matches
|
||||
for (int i = start + pos; i < end && i < words.size(); i++) {
|
||||
String word = words.get(i);
|
||||
if (word.startsWith(prefix)) {
|
||||
allMatches.add(new WordScorePair(word, wordScores.get(i)));
|
||||
} else if (word.compareTo(prefix) > 0) {
|
||||
break; // Past potential matches
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by score and take top k
|
||||
allMatches.sort((a, b) -> Integer.compare(b.score, a.score));
|
||||
List<ScoredSuggestion> results = new ArrayList<>(Math.min(k, allMatches.size()));
|
||||
for (int i = 0; i < Math.min(k, allMatches.size()); i++) {
|
||||
WordScorePair pair = allMatches.get(i);
|
||||
results.add(new ScoredSuggestion(pair.word, pair.score));
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
// Fallback for small dictionaries - linear scan but optimized
|
||||
return simpleSearchFallback(prefix, k);
|
||||
}
|
||||
|
||||
/**
|
||||
* Optimized linear scan - only used for small dictionaries.
|
||||
*/
|
||||
private List<ScoredSuggestion> simpleSearchFallback(String prefix, int k) {
|
||||
// Use primitive arrays for better cache locality
|
||||
int[] matchScores = new int[Math.min(words.size(), 100)]; // Assume we won't find more than 100 matches
|
||||
String[] matchWords = new String[matchScores.length];
|
||||
int matchCount = 0;
|
||||
|
||||
for (int i = 0; i < words.size() && matchCount < matchScores.length; i++) {
|
||||
String word = words.get(i);
|
||||
if (word.startsWith(prefix)) {
|
||||
matchWords[matchCount] = word;
|
||||
matchScores[matchCount] = wordScores.get(i);
|
||||
matchCount++;
|
||||
}
|
||||
}
|
||||
|
||||
// Sort matches by score (in-place for small arrays)
|
||||
for (int i = 0; i < matchCount; i++) {
|
||||
for (int j = i + 1; j < matchCount; j++) {
|
||||
if (matchScores[j] > matchScores[i]) {
|
||||
// Swap scores
|
||||
int tempScore = matchScores[i];
|
||||
matchScores[i] = matchScores[j];
|
||||
matchScores[j] = tempScore;
|
||||
|
||||
// Swap words
|
||||
String tempWord = matchWords[i];
|
||||
matchWords[i] = matchWords[j];
|
||||
matchWords[j] = tempWord;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create results
|
||||
List<ScoredSuggestion> results = new ArrayList<>(Math.min(k, matchCount));
|
||||
for (int i = 0; i < Math.min(k, matchCount); i++) {
|
||||
results.add(new ScoredSuggestion(matchWords[i], matchScores[i]));
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get top k words from candidate IDs, filtering by the full prefix.
|
||||
*/
|
||||
private List<ScoredSuggestion> getFilteredTopKFromWordIds(TIntArrayList wordIds, String fullPrefix, int k) {
|
||||
if (wordIds == null || wordIds.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
// Make primitive arrays for better performance
|
||||
String[] matchWords = new String[Math.min(wordIds.size(), 1000)];
|
||||
int[] matchScores = new int[matchWords.length];
|
||||
int matchCount = 0;
|
||||
|
||||
int[] ids = wordIds.toArray();
|
||||
for (int i = 0; i < ids.length && matchCount < matchWords.length; i++) {
|
||||
int wordId = ids[i];
|
||||
if (wordId >= 0 && wordId < words.size()) {
|
||||
String word = words.get(wordId);
|
||||
if (word.startsWith(fullPrefix)) {
|
||||
matchWords[matchCount] = word;
|
||||
matchScores[matchCount] = wordScores.get(wordId);
|
||||
matchCount++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by score (efficient insertion sort for small k)
|
||||
for (int i = 0; i < Math.min(matchCount, k); i++) {
|
||||
int maxPos = i;
|
||||
for (int j = i + 1; j < matchCount; j++) {
|
||||
if (matchScores[j] > matchScores[maxPos]) {
|
||||
maxPos = j;
|
||||
}
|
||||
}
|
||||
if (maxPos != i) {
|
||||
// Swap
|
||||
int tempScore = matchScores[i];
|
||||
matchScores[i] = matchScores[maxPos];
|
||||
matchScores[maxPos] = tempScore;
|
||||
|
||||
String tempWord = matchWords[i];
|
||||
matchWords[i] = matchWords[maxPos];
|
||||
matchWords[maxPos] = tempWord;
|
||||
}
|
||||
}
|
||||
|
||||
// Create result list (only up to k elements)
|
||||
List<ScoredSuggestion> results = new ArrayList<>(Math.min(k, matchCount));
|
||||
for (int i = 0; i < Math.min(k, matchCount); i++) {
|
||||
results.add(new ScoredSuggestion(matchWords[i], matchScores[i]));
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
* Class representing a suggested completion.
|
||||
*/
|
||||
public static class ScoredSuggestion {
|
||||
private final String word;
|
||||
private final int score;
|
||||
|
||||
public ScoredSuggestion(String word, int score) {
|
||||
this.word = word;
|
||||
this.score = score;
|
||||
}
|
||||
|
||||
public String getWord() {
|
||||
return word;
|
||||
}
|
||||
|
||||
public int getScore() {
|
||||
return score;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return word + " (" + score + ")";
|
||||
}
|
||||
}
|
||||
}
|
@@ -4,23 +4,24 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.functions.math.dict.SpellChecker;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import org.apache.commons.collections4.trie.PatriciaTrie;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.*;
|
||||
import java.util.function.Supplier;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Scanner;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
public class Suggestions {
|
||||
private PatriciaTrie<String> suggestionsTrie = null;
|
||||
private PrefixSearchStructure searchStructure = null;
|
||||
private TermFrequencyDict termFrequencyDict = null;
|
||||
private volatile boolean ready = false;
|
||||
private final SpellChecker spellChecker;
|
||||
@@ -37,39 +38,40 @@ public class Suggestions {
|
||||
this.spellChecker = spellChecker;
|
||||
|
||||
Thread.ofPlatform().start(() -> {
|
||||
suggestionsTrie = loadSuggestions(suggestionsFile);
|
||||
searchStructure = loadSuggestions(suggestionsFile);
|
||||
termFrequencyDict = dict;
|
||||
ready = true;
|
||||
logger.info("Loaded {} suggestions", suggestionsTrie.size());
|
||||
logger.info("Loaded {} suggestions", searchStructure.size());
|
||||
});
|
||||
}
|
||||
|
||||
private static PatriciaTrie<String> loadSuggestions(Path file) {
|
||||
private static PrefixSearchStructure loadSuggestions(Path file) {
|
||||
PrefixSearchStructure ret = new PrefixSearchStructure();
|
||||
|
||||
if (!Files.exists(file)) {
|
||||
logger.error("Suggestions file {} absent, loading empty suggestions db", file);
|
||||
return new PatriciaTrie<>();
|
||||
return ret;
|
||||
}
|
||||
try (var lines = Files.lines(file)) {
|
||||
var ret = new PatriciaTrie<String>();
|
||||
|
||||
lines.filter(suggestionPattern.asPredicate())
|
||||
.filter(line -> line.length()<32)
|
||||
.map(String::toLowerCase)
|
||||
.forEach(w -> ret.put(w, w));
|
||||
|
||||
// Add special keywords to the suggestions
|
||||
for (var feature : HtmlFeature.values()) {
|
||||
String keyword = feature.getKeyword();
|
||||
|
||||
ret.put(keyword, keyword);
|
||||
ret.put("-" + keyword, "-" + keyword);
|
||||
try (var scanner = new Scanner(new GZIPInputStream(new BufferedInputStream(Files.newInputStream(file, StandardOpenOption.READ))))) {
|
||||
while (scanner.hasNextLine()) {
|
||||
String line = scanner.nextLine();
|
||||
String[] parts = StringUtils.split(line, " ", 2);
|
||||
if (parts.length != 2) {
|
||||
logger.warn("Invalid suggestion line: {}", line);
|
||||
continue;
|
||||
}
|
||||
int cnt = Integer.parseInt(parts[0]);
|
||||
if (cnt > 1) {
|
||||
String word = parts[1];
|
||||
ret.insert(word, cnt);
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.error("Failed to load suggestions file", ex);
|
||||
return new PatriciaTrie<>();
|
||||
return new PrefixSearchStructure();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -83,96 +85,24 @@ public class Suggestions {
|
||||
|
||||
searchWord = StringUtils.stripStart(searchWord.toLowerCase(), " ");
|
||||
|
||||
return Stream.of(
|
||||
new SuggestionStream("", getSuggestionsForKeyword(count, searchWord)),
|
||||
suggestionsForLastWord(count, searchWord),
|
||||
spellCheckStream(searchWord)
|
||||
)
|
||||
.flatMap(SuggestionsStreamable::stream)
|
||||
.limit(count)
|
||||
.collect(Collectors.toList());
|
||||
return getSuggestionsForKeyword(count, searchWord);
|
||||
}
|
||||
|
||||
private SuggestionsStreamable suggestionsForLastWord(int count, String searchWord) {
|
||||
int sp = searchWord.lastIndexOf(' ');
|
||||
|
||||
if (sp < 0) {
|
||||
return Stream::empty;
|
||||
}
|
||||
|
||||
String prefixString = searchWord.substring(0, sp+1);
|
||||
String suggestString = searchWord.substring(sp+1);
|
||||
|
||||
return new SuggestionStream(prefixString, getSuggestionsForKeyword(count, suggestString));
|
||||
|
||||
}
|
||||
|
||||
private SuggestionsStreamable spellCheckStream(String word) {
|
||||
int start = word.lastIndexOf(' ');
|
||||
String prefix;
|
||||
String corrWord;
|
||||
|
||||
if (start < 0) {
|
||||
corrWord = word;
|
||||
prefix = "";
|
||||
}
|
||||
else {
|
||||
prefix = word.substring(0, start + 1);
|
||||
corrWord = word.substring(start + 1);
|
||||
}
|
||||
|
||||
if (corrWord.length() >= MIN_SUGGEST_LENGTH) {
|
||||
Supplier<Stream<String>> suggestionsLazyEval = () -> spellChecker.correct(corrWord).stream();
|
||||
return new SuggestionStream(prefix, Stream.of(suggestionsLazyEval).flatMap(Supplier::get));
|
||||
}
|
||||
else {
|
||||
return Stream::empty;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public Stream<String> getSuggestionsForKeyword(int count, String prefix) {
|
||||
public List<String> getSuggestionsForKeyword(int count, String prefix) {
|
||||
if (!ready)
|
||||
return Stream.empty();
|
||||
return List.of();
|
||||
|
||||
if (prefix.length() < MIN_SUGGEST_LENGTH) {
|
||||
return Stream.empty();
|
||||
return List.of();
|
||||
}
|
||||
|
||||
var start = suggestionsTrie.select(prefix);
|
||||
|
||||
if (start == null) {
|
||||
return Stream.empty();
|
||||
var results = searchStructure.getTopCompletions(prefix, count);
|
||||
List<String> ret = new ArrayList<>(count);
|
||||
for (var result : results) {
|
||||
ret.add(result.getWord());
|
||||
}
|
||||
|
||||
if (!start.getKey().startsWith(prefix)) {
|
||||
return Stream.empty();
|
||||
}
|
||||
|
||||
SuggestionsValueCalculator sv = new SuggestionsValueCalculator();
|
||||
|
||||
return Stream.iterate(start.getKey(), Objects::nonNull, suggestionsTrie::nextKey)
|
||||
.takeWhile(s -> s.startsWith(prefix))
|
||||
.limit(256)
|
||||
.sorted(Comparator.comparing(sv::get).thenComparing(String::length).thenComparing(Comparator.naturalOrder()))
|
||||
.limit(count);
|
||||
return ret;
|
||||
}
|
||||
|
||||
private record SuggestionStream(String prefix, Stream<String> suggestionStream) implements SuggestionsStreamable {
|
||||
public Stream<String> stream() {
|
||||
return suggestionStream.map(s -> prefix + s);
|
||||
}
|
||||
}
|
||||
|
||||
interface SuggestionsStreamable { Stream<String> stream(); }
|
||||
|
||||
private class SuggestionsValueCalculator {
|
||||
|
||||
private final Map<String, Long> hashCache = new HashMap<>(512);
|
||||
|
||||
public int get(String s) {
|
||||
long hash = hashCache.computeIfAbsent(s, TermFrequencyDict::getStringHash);
|
||||
return -termFrequencyDict.getTermFreqHash(hash);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -59,9 +59,9 @@ public class ControlMain extends MainClass {
|
||||
download(adblockFile, new URI("https://downloads.marginalia.nu/data/adblock.txt"));
|
||||
}
|
||||
|
||||
Path suggestionsFile = dataPath.resolve("suggestions.txt");
|
||||
Path suggestionsFile = dataPath.resolve("suggestions2.txt.gz");
|
||||
if (!Files.exists(suggestionsFile)) {
|
||||
downloadGzipped(suggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions.txt.gz"));
|
||||
download(suggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions2.txt.gz"));
|
||||
}
|
||||
|
||||
Path asnRawData = dataPath.resolve("asn-data-raw-table");
|
||||
|
@@ -24,25 +24,25 @@ This is a sample of real crawl data. It is intended for demo, testing and devel
|
||||
<tr>
|
||||
<td><input id="sample-s" value="sample-s" name="sample" class="form-check-input" type="radio"></td>
|
||||
<td><label for="sample-s">Small</label></td>
|
||||
<td>1000 Domains. About 2 GB. </td>
|
||||
<td>1000 Domains. About 1 GB. </td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td><input id="sample-m" value="sample-m" name="sample" class="form-check-input" type="radio"></td>
|
||||
<td><label for="sample-m">Medium</label></td>
|
||||
<td>2000 Domains. About 6 GB. Recommended.</td>
|
||||
<td>2000 Domains. About 2 GB. Recommended.</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td><input id="sample-l" value="sample-l" name="sample" class="form-check-input" type="radio"></td>
|
||||
<td><label for="sample-l">Large</label></td>
|
||||
<td>5000 Domains. About 20 GB.</td>
|
||||
<td>5000 Domains. About 7 GB.</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td><input id="sample-xl" value="sample-xl" name="sample" class="form-check-input" type="radio"></td>
|
||||
<td><label for="sample-xl">Huge</label></td>
|
||||
<td>50,000 Domains. Around 180 GB. Primarily intended for pre-production like testing environments.
|
||||
<td>50,000 Domains. Around 80 GB. Primarily intended for pre-production like testing environments.
|
||||
Expect hours of processing time. </td>
|
||||
</tr>
|
||||
</table>
|
||||
|
@@ -44,6 +44,7 @@ dependencies {
|
||||
implementation libs.guice
|
||||
implementation libs.fastutil
|
||||
implementation libs.trove
|
||||
implementation libs.bundles.httpcomponents
|
||||
|
||||
testImplementation libs.bundles.junit
|
||||
testImplementation project(':code:libraries:test-helpers')
|
||||
|
@@ -10,7 +10,7 @@ import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
|
||||
import nu.marginalia.converting.processor.DomainProcessor;
|
||||
import nu.marginalia.converting.writer.ConverterBatchWriter;
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.fetcher.Cookies;
|
||||
import nu.marginalia.crawl.fetcher.DomainCookies;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.functions.searchquery.QueryFactory;
|
||||
@@ -121,11 +121,12 @@ public class IntegrationTest {
|
||||
public void run() throws Exception {
|
||||
|
||||
/** CREATE WARC */
|
||||
try (WarcRecorder warcRecorder = new WarcRecorder(warcData, new Cookies())) {
|
||||
try (WarcRecorder warcRecorder = new WarcRecorder(warcData)) {
|
||||
warcRecorder.writeWarcinfoHeader("127.0.0.1", new EdgeDomain("www.example.com"),
|
||||
new HttpFetcherImpl.DomainProbeResult.Ok(new EdgeUrl("https://www.example.com/")));
|
||||
|
||||
warcRecorder.writeReferenceCopy(new EdgeUrl("https://www.example.com/"),
|
||||
new DomainCookies(),
|
||||
"text/html", 200,
|
||||
"""
|
||||
<html>
|
||||
|
@@ -179,8 +179,9 @@ dependencyResolutionManagement {
|
||||
|
||||
library('jwarc', 'org.netpreserve', 'jwarc').version('0.28.5')
|
||||
|
||||
library('httpcomponents.core','org.apache.httpcomponents','httpcore').version('4.4.15')
|
||||
library('httpcomponents.client','org.apache.httpcomponents','httpclient').version('4.5.13')
|
||||
|
||||
library('httpcore', 'org.apache.httpcomponents.core5','httpcore5').version('5.3.4')
|
||||
library('httpclient', 'org.apache.httpcomponents.client5','httpclient5').version('5.4.3')
|
||||
library('commons.net', 'commons-net','commons-net').version('3.9.0')
|
||||
library('commons.lang3', 'org.apache.commons','commons-lang3').version('3.12.0')
|
||||
library('commons.compress','org.apache.commons','commons-compress').version('1.25.0')
|
||||
@@ -255,7 +256,7 @@ dependencyResolutionManagement {
|
||||
bundle('grpc', ['protobuf', 'grpc-stub', 'grpc-protobuf', 'grpc-netty'])
|
||||
bundle('protobuf', ['protobuf', 'javax.annotation'])
|
||||
bundle('gson', ['gson', 'gson-type-adapter'])
|
||||
bundle('httpcomponents', ['httpcomponents.core', 'httpcomponents.client'])
|
||||
bundle('httpcomponents', ['httpcore', 'httpclient'])
|
||||
bundle('parquet', ['parquet-column', 'parquet-hadoop'])
|
||||
bundle('junit', ['junit.jupiter', 'junit.jupiter.engine'])
|
||||
bundle('flyway', ['flyway.core', 'flyway.mysql'])
|
||||
|
Reference in New Issue
Block a user