1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 17:32:39 +02:00

Compare commits

...

6 Commits

Author SHA1 Message Date
Viktor Lofgren
8a944cf4c6 (crawler) Add request time to crawl data
This is an interesting indicator of website quality.
2025-05-19 14:07:41 +02:00
Viktor Lofgren
1c128e6d82 (crawler) Add request time to crawl data
This is an interesting indicator of website quality.
2025-05-19 14:02:03 +02:00
Viktor Lofgren
4edc0d3267 (converter) Increase work buffer for converter
Conversion on index node 7 in production is crashing, ostensibly because this buffer is too small.
2025-05-18 13:22:44 +02:00
Viktor Lofgren
890f521d0d (pdf) Fix crash for some bold lines 2025-05-18 13:05:05 +02:00
Viktor Lofgren
b1814a30f7 (deploy) Redeploy all services. 2025-05-17 13:11:51 +02:00
Viktor Lofgren
f59a9eb025 (legacy-search) Soften domain limit constraints in URL deduplication 2025-05-17 00:04:27 +02:00
14 changed files with 67 additions and 20 deletions

View File

@@ -84,7 +84,7 @@ public class ForwardIndexConverter {
LongArray docFileData = LongArrayFactory.mmapForWritingConfined(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size()); LongArray docFileData = LongArrayFactory.mmapForWritingConfined(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size());
ByteBuffer workArea = ByteBuffer.allocate(65536); ByteBuffer workArea = ByteBuffer.allocate(1024*1024*100);
for (var instance : journal.pages()) { for (var instance : journal.pages()) {
try (var slopTable = new SlopTable(instance.baseDir(), instance.page())) try (var slopTable = new SlopTable(instance.baseDir(), instance.page()))
{ {

View File

@@ -53,6 +53,7 @@ public class SideloaderProcessing {
"", "",
body.getBytes(StandardCharsets.UTF_8), body.getBytes(StandardCharsets.UTF_8),
false, false,
-1,
null, null,
null null
); );

View File

@@ -2002,12 +2002,11 @@ public class HeadingAwarePDFTextStripper extends LegacyPDFStreamEngine
float minFontWeight = Integer.MAX_VALUE; float minFontWeight = Integer.MAX_VALUE;
for (var word : line) for (var word : line)
{ {
int i = 0;
for (var textPosition : word.getTextPositions()) for (var textPosition : word.getTextPositions())
{ {
if (word.text.charAt(i++) == ' ') { // Skip empty text positions as they may have a different font
continue; if (word.text.isBlank()) continue;
}
var font = textPosition.getFont(); var font = textPosition.getFont();
if (font == null) continue; if (font == null) continue;
var descriptor = font.getFontDescriptor(); var descriptor = font.getFontDescriptor();

View File

@@ -148,6 +148,7 @@ public class ConvertingIntegrationTest {
"", "",
readClassPathFile(p.toString()).getBytes(), readClassPathFile(p.toString()).getBytes(),
false, false,
-1,
null, null,
null null
); );

View File

@@ -50,7 +50,7 @@ class PdfDocumentProcessorPluginTest {
)); ));
} }
public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(byte[] pdfBytes) throws Exception { public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(byte[] pdfBytes) throws Exception {
var doc = new CrawledDocument("test", "https://www.example.com/sample.pdf", "application/pdf", Instant.now().toString(), 200, "OK", "OK", "", pdfBytes, false, null, null); var doc = new CrawledDocument("test", "https://www.example.com/sample.pdf", "application/pdf", Instant.now().toString(), 200, "OK", "OK", "", pdfBytes, false, -1, null, null);
return plugin.createDetails(doc, new LinkTexts(), DocumentClass.NORMAL); return plugin.createDetails(doc, new LinkTexts(), DocumentClass.NORMAL);
} }

View File

@@ -10,6 +10,7 @@ import java.net.http.HttpClient;
import java.net.http.HttpHeaders; import java.net.http.HttpHeaders;
import java.net.http.HttpResponse; import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@@ -90,8 +91,8 @@ public class WarcProtocolReconstructor {
return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n"; return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
} }
static String getResponseHeader(ClassicHttpResponse response, long size) { static String getResponseHeader(ClassicHttpResponse response, Duration responseDuration, long size) {
String headerString = getHeadersAsString(response.getHeaders(), size); String headerString = getHeadersAsString(response.getHeaders(), responseDuration, size);
return response.getVersion().format() + " " + response.getCode() + " " + response.getReasonPhrase() + "\r\n" + headerString + "\r\n\r\n"; return response.getVersion().format() + " " + response.getCode() + " " + response.getReasonPhrase() + "\r\n" + headerString + "\r\n\r\n";
} }
@@ -160,7 +161,7 @@ public class WarcProtocolReconstructor {
static private String getHeadersAsString(Header[] headers, long responseSize) { static private String getHeadersAsString(Header[] headers, Duration responseDuration, long responseSize) {
StringJoiner joiner = new StringJoiner("\r\n"); StringJoiner joiner = new StringJoiner("\r\n");
for (var header : headers) { for (var header : headers) {
@@ -176,6 +177,7 @@ public class WarcProtocolReconstructor {
if (headerCapitalized.equals("Content-Encoding")) if (headerCapitalized.equals("Content-Encoding"))
continue; continue;
// Since we're transparently decoding gzip, we need to update the Content-Length header // Since we're transparently decoding gzip, we need to update the Content-Length header
// to reflect the actual size of the response body. We'll do this at the end. // to reflect the actual size of the response body. We'll do this at the end.
if (headerCapitalized.equals("Content-Length")) if (headerCapitalized.equals("Content-Length"))
@@ -184,6 +186,7 @@ public class WarcProtocolReconstructor {
joiner.add(headerCapitalized + ": " + header.getValue()); joiner.add(headerCapitalized + ": " + header.getValue());
} }
joiner.add("X-Marginalia-Response-Time: " + responseDuration.toMillis());
joiner.add("Content-Length: " + responseSize); joiner.add("Content-Length: " + responseSize);
return joiner.toString(); return joiner.toString();

View File

@@ -93,7 +93,7 @@ public class WarcRecorder implements AutoCloseable {
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder(); WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder(); WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
Instant date = Instant.now(); Instant requestDate = Instant.now();
// Not entirely sure why we need to do this, but keeping it due to Chesterton's Fence // Not entirely sure why we need to do this, but keeping it due to Chesterton's Fence
Map<String, List<String>> extraHeaders = new HashMap<>(request.getHeaders().length); Map<String, List<String>> extraHeaders = new HashMap<>(request.getHeaders().length);
@@ -108,6 +108,8 @@ public class WarcRecorder implements AutoCloseable {
try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request, timeout); try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request, timeout);
InputStream inputStream = inputBuffer.read()) { InputStream inputStream = inputBuffer.read()) {
Instant responseDate = Instant.now();
cookies.updateCookieStore(response); cookies.updateCookieStore(response);
// Build and write the request // Build and write the request
@@ -126,7 +128,7 @@ public class WarcRecorder implements AutoCloseable {
WarcRequest warcRequest = new WarcRequest.Builder(requestUri) WarcRequest warcRequest = new WarcRequest.Builder(requestUri)
.blockDigest(requestDigestBuilder.build()) .blockDigest(requestDigestBuilder.build())
.date(date) .date(requestDate)
.body(MediaType.HTTP_REQUEST, httpRequestString) .body(MediaType.HTTP_REQUEST, httpRequestString)
.build(); .build();
@@ -138,7 +140,9 @@ public class WarcRecorder implements AutoCloseable {
response.addHeader("X-Has-Cookies", 1); response.addHeader("X-Has-Cookies", 1);
} }
byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8); byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response,
Duration.between(requestDate, responseDate),
inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length); ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);
@@ -169,7 +173,7 @@ public class WarcRecorder implements AutoCloseable {
WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri) WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
.blockDigest(responseDigestBuilder.build()) .blockDigest(responseDigestBuilder.build())
.date(date) .date(responseDate)
.concurrentTo(warcRequest.id()) .concurrentTo(warcRequest.id())
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes()); .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
@@ -184,7 +188,7 @@ public class WarcRecorder implements AutoCloseable {
warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
writer.write(warcResponse); writer.write(warcResponse);
if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0 if (Duration.between(requestDate, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
&& inputBuffer.size() < 2048 && inputBuffer.size() < 2048
&& !requestUri.getPath().endsWith("robots.txt")) // don't bail on robots.txt && !requestUri.getPath().endsWith("robots.txt")) // don't bail on robots.txt
{ {
@@ -196,7 +200,7 @@ public class WarcRecorder implements AutoCloseable {
logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)", logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)",
requestUri, requestUri,
Duration.between(date, Instant.now()).getSeconds(), Duration.between(requestDate, Instant.now()).getSeconds(),
inputBuffer.size() inputBuffer.size()
); );

View File

@@ -148,6 +148,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
nextRecord.body, nextRecord.body,
// this field isn't actually used, maybe we can skip calculating it? // this field isn't actually used, maybe we can skip calculating it?
nextRecord.cookies, nextRecord.cookies,
-1,
lastModified, lastModified,
etag)); etag));
} }

View File

@@ -166,6 +166,7 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
nextRecord.body(), nextRecord.body(),
// this field isn't actually used, maybe we can skip calculating it? // this field isn't actually used, maybe we can skip calculating it?
nextRecord.cookies(), nextRecord.cookies(),
nextRecord.requestTimeMs(),
null, null,
null)); null));
} }

View File

@@ -23,6 +23,7 @@ public final class CrawledDocument implements SerializableCrawlData {
public String crawlerStatus; public String crawlerStatus;
public String crawlerStatusDesc; public String crawlerStatusDesc;
public int requestTimeMs;
@Nullable @Nullable
public String headers; public String headers;
@@ -82,7 +83,7 @@ public final class CrawledDocument implements SerializableCrawlData {
public String lastModifiedMaybe; public String lastModifiedMaybe;
public String etagMaybe; public String etagMaybe;
public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, byte[] documentBodyBytes, Boolean hasCookies, String lastModifiedMaybe, String etagMaybe) { public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, byte[] documentBodyBytes, Boolean hasCookies, int requestTimeMs, String lastModifiedMaybe, String etagMaybe) {
this.crawlId = crawlId; this.crawlId = crawlId;
this.url = url; this.url = url;
this.contentType = contentType; this.contentType = contentType;
@@ -94,6 +95,7 @@ public final class CrawledDocument implements SerializableCrawlData {
this.documentBodyBytes = Objects.requireNonNullElse(documentBodyBytes, new byte[] {}); this.documentBodyBytes = Objects.requireNonNullElse(documentBodyBytes, new byte[] {});
this.hasCookies = hasCookies; this.hasCookies = hasCookies;
this.lastModifiedMaybe = lastModifiedMaybe; this.lastModifiedMaybe = lastModifiedMaybe;
this.requestTimeMs = requestTimeMs;
this.etagMaybe = etagMaybe; this.etagMaybe = etagMaybe;
} }
@@ -173,6 +175,7 @@ public final class CrawledDocument implements SerializableCrawlData {
private byte[] documentBodyBytes = new byte[0]; private byte[] documentBodyBytes = new byte[0];
private String recrawlState; private String recrawlState;
private Boolean hasCookies; private Boolean hasCookies;
private int requestTimeMs;
private String lastModifiedMaybe; private String lastModifiedMaybe;
private String etagMaybe; private String etagMaybe;
@@ -248,8 +251,13 @@ public final class CrawledDocument implements SerializableCrawlData {
return this; return this;
} }
/**
 * Sets the request time, in milliseconds, that the built document will carry.
 *
 * NOTE(review): call sites visible in this change pass -1 when no timing
 * is available; presumably -1 means "unknown" -- confirm against consumers.
 *
 * @param requestTimeMs the request time in milliseconds
 * @return this builder, to allow call chaining
 */
public CrawledDocumentBuilder requestTimeMs(int requestTimeMs) {
this.requestTimeMs = requestTimeMs;
return this;
}
public CrawledDocument build() { public CrawledDocument build() {
return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBodyBytes, this.hasCookies, this.lastModifiedMaybe, this.etagMaybe); return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBodyBytes, this.hasCookies, this.requestTimeMs, this.lastModifiedMaybe, this.etagMaybe);
} }
public String toString() { public String toString() {

View File

@@ -9,6 +9,7 @@ import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecord;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader; import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
import nu.marginalia.slop.column.array.ByteArrayColumn; import nu.marginalia.slop.column.array.ByteArrayColumn;
import nu.marginalia.slop.column.primitive.ByteColumn; import nu.marginalia.slop.column.primitive.ByteColumn;
import nu.marginalia.slop.column.primitive.IntColumn;
import nu.marginalia.slop.column.primitive.LongColumn; import nu.marginalia.slop.column.primitive.LongColumn;
import nu.marginalia.slop.column.primitive.ShortColumn; import nu.marginalia.slop.column.primitive.ShortColumn;
import nu.marginalia.slop.column.string.EnumColumn; import nu.marginalia.slop.column.string.EnumColumn;
@@ -39,6 +40,7 @@ public record SlopCrawlDataRecord(String domain,
long timestamp, long timestamp,
String contentType, String contentType,
byte[] body, byte[] body,
int requestTimeMs,
String headers) String headers)
{ {
private static final EnumColumn domainColumn = new EnumColumn("domain", StandardCharsets.UTF_8, StorageType.ZSTD); private static final EnumColumn domainColumn = new EnumColumn("domain", StandardCharsets.UTF_8, StorageType.ZSTD);
@@ -49,6 +51,7 @@ public record SlopCrawlDataRecord(String domain,
private static final LongColumn timestampColumn = new LongColumn("timestamp"); private static final LongColumn timestampColumn = new LongColumn("timestamp");
private static final EnumColumn contentTypeColumn = new EnumColumn("contentType", StandardCharsets.UTF_8); private static final EnumColumn contentTypeColumn = new EnumColumn("contentType", StandardCharsets.UTF_8);
private static final ByteArrayColumn bodyColumn = new ByteArrayColumn("body", StorageType.ZSTD); private static final ByteArrayColumn bodyColumn = new ByteArrayColumn("body", StorageType.ZSTD);
private static final ShortColumn requestTimeColumn = new ShortColumn("requestTimeMs");
private static final StringColumn headerColumn = new StringColumn("header", StandardCharsets.UTF_8, StorageType.ZSTD); private static final StringColumn headerColumn = new StringColumn("header", StandardCharsets.UTF_8, StorageType.ZSTD);
public SlopCrawlDataRecord(CrawledDocumentParquetRecord parquetRecord) { public SlopCrawlDataRecord(CrawledDocumentParquetRecord parquetRecord) {
@@ -60,6 +63,7 @@ public record SlopCrawlDataRecord(String domain,
parquetRecord.timestamp.toEpochMilli(), parquetRecord.timestamp.toEpochMilli(),
parquetRecord.contentType, parquetRecord.contentType,
parquetRecord.body, parquetRecord.body,
-1,
parquetRecord.headers parquetRecord.headers
); );
} }
@@ -74,6 +78,7 @@ public record SlopCrawlDataRecord(String domain,
date.toEpochMilli(), date.toEpochMilli(),
"x-marginalia/advisory;state=redirect", "x-marginalia/advisory;state=redirect",
new byte[0], new byte[0],
-1,
"" ""
); );
} }
@@ -87,6 +92,7 @@ public record SlopCrawlDataRecord(String domain,
date.toEpochMilli(), date.toEpochMilli(),
"x-marginalia/advisory;state=error", "x-marginalia/advisory;state=error",
errorStatus.getBytes(), errorStatus.getBytes(),
-1,
"" ""
); );
} }
@@ -100,6 +106,7 @@ public record SlopCrawlDataRecord(String domain,
date.toEpochMilli(), date.toEpochMilli(),
errorStatus, errorStatus,
new byte[0], new byte[0],
-1,
"" ""
); );
} }
@@ -321,6 +328,7 @@ public record SlopCrawlDataRecord(String domain,
private final LongColumn.Writer timestampColumnWriter; private final LongColumn.Writer timestampColumnWriter;
private final EnumColumn.Writer contentTypeColumnWriter; private final EnumColumn.Writer contentTypeColumnWriter;
private final ByteArrayColumn.Writer bodyColumnWriter; private final ByteArrayColumn.Writer bodyColumnWriter;
private final ShortColumn.Writer requestTimeColumnWriter;
private final StringColumn.Writer headerColumnWriter; private final StringColumn.Writer headerColumnWriter;
public Writer(Path path) throws IOException { public Writer(Path path) throws IOException {
@@ -334,6 +342,7 @@ public record SlopCrawlDataRecord(String domain,
timestampColumnWriter = timestampColumn.create(this); timestampColumnWriter = timestampColumn.create(this);
contentTypeColumnWriter = contentTypeColumn.create(this); contentTypeColumnWriter = contentTypeColumn.create(this);
bodyColumnWriter = bodyColumn.create(this); bodyColumnWriter = bodyColumn.create(this);
requestTimeColumnWriter = requestTimeColumn.create(this);
headerColumnWriter = headerColumn.create(this); headerColumnWriter = headerColumn.create(this);
} }
@@ -346,6 +355,7 @@ public record SlopCrawlDataRecord(String domain,
timestampColumnWriter.put(record.timestamp); timestampColumnWriter.put(record.timestamp);
contentTypeColumnWriter.put(record.contentType); contentTypeColumnWriter.put(record.contentType);
bodyColumnWriter.put(record.body); bodyColumnWriter.put(record.body);
requestTimeColumnWriter.put((short) record.requestTimeMs);
headerColumnWriter.put(record.headers); headerColumnWriter.put(record.headers);
} }
@@ -391,10 +401,20 @@ public record SlopCrawlDataRecord(String domain,
String headersStr; String headersStr;
StringJoiner headersStrBuilder = new StringJoiner("\n"); StringJoiner headersStrBuilder = new StringJoiner("\n");
int requestTimeMs = -1;
for (var header : headers) { for (var header : headers) {
if (header.getName().equalsIgnoreCase("X-Cookies") && "1".equals(header.getValue())) { if (header.getName().equalsIgnoreCase("X-Cookies") && "1".equals(header.getValue())) {
hasCookies = true; hasCookies = true;
} }
if (header.getName().equals("X-Marginalia-Response-Time")) {
try {
requestTimeMs = Integer.parseInt(header.getValue());
}
catch (NumberFormatException ex) {
logger.warn("Failed to parse X-Marginalia-Response-Time header: {}", header.getValue());
}
continue;
}
headersStrBuilder.add(header.getName() + ": " + header.getValue()); headersStrBuilder.add(header.getName() + ": " + header.getValue());
} }
headersStr = headersStrBuilder.toString(); headersStr = headersStrBuilder.toString();
@@ -409,6 +429,7 @@ public record SlopCrawlDataRecord(String domain,
response.date().toEpochMilli(), response.date().toEpochMilli(),
contentType, contentType,
bodyBytes, bodyBytes,
requestTimeMs,
headersStr headersStr
) )
); );
@@ -461,6 +482,7 @@ public record SlopCrawlDataRecord(String domain,
private final LongColumn.Reader timestampColumnReader; private final LongColumn.Reader timestampColumnReader;
private final EnumColumn.Reader contentTypeColumnReader; private final EnumColumn.Reader contentTypeColumnReader;
private final ByteArrayColumn.Reader bodyColumnReader; private final ByteArrayColumn.Reader bodyColumnReader;
private final ShortColumn.Reader requestTimeColumnReader;
private final StringColumn.Reader headerColumnReader; private final StringColumn.Reader headerColumnReader;
public Reader(Path path) throws IOException { public Reader(Path path) throws IOException {
@@ -474,6 +496,7 @@ public record SlopCrawlDataRecord(String domain,
timestampColumnReader = timestampColumn.open(this); timestampColumnReader = timestampColumn.open(this);
contentTypeColumnReader = contentTypeColumn.open(this); contentTypeColumnReader = contentTypeColumn.open(this);
bodyColumnReader = bodyColumn.open(this); bodyColumnReader = bodyColumn.open(this);
requestTimeColumnReader = requestTimeColumn.open(this);
headerColumnReader = headerColumn.open(this); headerColumnReader = headerColumn.open(this);
} }
@@ -487,6 +510,7 @@ public record SlopCrawlDataRecord(String domain,
timestampColumnReader.get(), timestampColumnReader.get(),
contentTypeColumnReader.get(), contentTypeColumnReader.get(),
bodyColumnReader.get(), bodyColumnReader.get(),
requestTimeColumnReader.get(),
headerColumnReader.get() headerColumnReader.get()
); );
} }
@@ -506,6 +530,7 @@ public record SlopCrawlDataRecord(String domain,
private final LongColumn.Reader timestampColumnReader; private final LongColumn.Reader timestampColumnReader;
private final EnumColumn.Reader contentTypeColumnReader; private final EnumColumn.Reader contentTypeColumnReader;
private final ByteArrayColumn.Reader bodyColumnReader; private final ByteArrayColumn.Reader bodyColumnReader;
private final ShortColumn.Reader requestTimeColumnReader;
private final StringColumn.Reader headerColumnReader; private final StringColumn.Reader headerColumnReader;
private SlopCrawlDataRecord next = null; private SlopCrawlDataRecord next = null;
@@ -521,6 +546,7 @@ public record SlopCrawlDataRecord(String domain,
timestampColumnReader = timestampColumn.open(this); timestampColumnReader = timestampColumn.open(this);
contentTypeColumnReader = contentTypeColumn.open(this); contentTypeColumnReader = contentTypeColumn.open(this);
bodyColumnReader = bodyColumn.open(this); bodyColumnReader = bodyColumn.open(this);
requestTimeColumnReader = requestTimeColumn.open(this);
headerColumnReader = headerColumn.open(this); headerColumnReader = headerColumn.open(this);
} }
@@ -548,6 +574,7 @@ public record SlopCrawlDataRecord(String domain,
boolean cookies = cookiesColumnReader.get() == 1; boolean cookies = cookiesColumnReader.get() == 1;
int status = statusColumnReader.get(); int status = statusColumnReader.get();
long timestamp = timestampColumnReader.get(); long timestamp = timestampColumnReader.get();
int requestTimeMs = requestTimeColumnReader.get();
String contentType = contentTypeColumnReader.get(); String contentType = contentTypeColumnReader.get();
LargeItem<byte[]> body = bodyColumnReader.getLarge(); LargeItem<byte[]> body = bodyColumnReader.getLarge();
@@ -555,7 +582,7 @@ public record SlopCrawlDataRecord(String domain,
if (filter(url, status, contentType)) { if (filter(url, status, contentType)) {
next = new SlopCrawlDataRecord( next = new SlopCrawlDataRecord(
domain, url, ip, cookies, status, timestamp, contentType, body.get(), headers.get() domain, url, ip, cookies, status, timestamp, contentType, body.get(), requestTimeMs, headers.get()
); );
return true; return true;
} }

View File

@@ -195,6 +195,7 @@ public class LiveCrawlDataSet implements AutoCloseable {
headers, headers,
body, body,
false, false,
-1,
"", "",
"" ""
)); ));

View File

@@ -61,7 +61,7 @@ public class UrlDeduplicator {
private boolean limitResultsPerDomain(DecoratedSearchResultItem details) { private boolean limitResultsPerDomain(DecoratedSearchResultItem details) {
final var domain = details.getUrl().getDomain(); final var domain = details.getUrl().getDomain();
final String key = domain.getDomainKey(); final String key = domain.toString();
return keyCount.adjustOrPutValue(key, 1, 1) <= resultsPerKey; return keyCount.adjustOrPutValue(key, 1, 1) <= resultsPerKey;
} }

View File

@@ -8,3 +8,4 @@
2025-05-05: Deploy executor partition 4. 2025-05-05: Deploy executor partition 4.
2025-05-05: Deploy control. 2025-05-05: Deploy control.
2025-05-08: Deploy assistant. 2025-05-08: Deploy assistant.
2025-05-17: Redeploy all.