Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-10-06 17:32:39 +02:00

Compare commits: deploy-019 ... deploy-019 (6 commits)
Commits:
8a944cf4c6
1c128e6d82
4edc0d3267
890f521d0d
b1814a30f7
f59a9eb025
@@ -84,7 +84,7 @@ public class ForwardIndexConverter {

         LongArray docFileData = LongArrayFactory.mmapForWritingConfined(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size());

-        ByteBuffer workArea = ByteBuffer.allocate(65536);
+        ByteBuffer workArea = ByteBuffer.allocate(1024*1024*100);
         for (var instance : journal.pages()) {
             try (var slopTable = new SlopTable(instance.baseDir(), instance.page()))
             {
@@ -53,6 +53,7 @@ public class SideloaderProcessing {
                 "",
                 body.getBytes(StandardCharsets.UTF_8),
                 false,
+                -1,
                 null,
                 null
         );
@@ -2002,12 +2002,11 @@ public class HeadingAwarePDFTextStripper extends LegacyPDFStreamEngine
             float minFontWeight = Integer.MAX_VALUE;
             for (var word : line)
             {
-                int i = 0;
                 for (var textPosition : word.getTextPositions())
                 {
-                    if (word.text.charAt(i++) == ' ') {
-                        continue;
-                    }
+                    // Skip empty text positions as they may have a different font
+                    if (word.text.isBlank()) continue;
+
                     var font = textPosition.getFont();
                     if (font == null) continue;
                     var descriptor = font.getFontDescriptor();
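
The fix replaces the old per-character space check, which walked an index through word.text alongside the text positions, with a single whole-word blank check: blank text positions are skipped entirely because, as the new comment notes, they may carry a different font and would skew the font-weight sampling. A trivial, self-contained illustration of the new predicate:

public class BlankWordCheck {
    public static void main(String[] args) {
        System.out.println(" ".isBlank());       // true  -> the text position is skipped entirely
        System.out.println("Heading".isBlank()); // false -> its font weights are sampled as before
    }
}
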
@@ -148,6 +148,7 @@ public class ConvertingIntegrationTest {
                 "",
                 readClassPathFile(p.toString()).getBytes(),
                 false,
+                -1,
                 null,
                 null
         );
@@ -50,7 +50,7 @@ class PdfDocumentProcessorPluginTest {
         ));
     }
     public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(byte[] pdfBytes) throws Exception {
-        var doc = new CrawledDocument("test", "https://www.example.com/sample.pdf", "application/pdf", Instant.now().toString(), 200, "OK", "OK", "", pdfBytes, false, null, null);
+        var doc = new CrawledDocument("test", "https://www.example.com/sample.pdf", "application/pdf", Instant.now().toString(), 200, "OK", "OK", "", pdfBytes, false, -1, null, null);
         return plugin.createDetails(doc, new LinkTexts(), DocumentClass.NORMAL);
     }
@@ -10,6 +10,7 @@ import java.net.http.HttpClient;
 import java.net.http.HttpHeaders;
 import java.net.http.HttpResponse;
 import java.nio.charset.StandardCharsets;
+import java.time.Duration;
 import java.util.*;
 import java.util.stream.Collectors;

@@ -90,8 +91,8 @@ public class WarcProtocolReconstructor {
         return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
     }

-    static String getResponseHeader(ClassicHttpResponse response, long size) {
-        String headerString = getHeadersAsString(response.getHeaders(), size);
+    static String getResponseHeader(ClassicHttpResponse response, Duration responseDuration, long size) {
+        String headerString = getHeadersAsString(response.getHeaders(), responseDuration, size);

         return response.getVersion().format() + " " + response.getCode() + " " + response.getReasonPhrase() + "\r\n" + headerString + "\r\n\r\n";
     }
@@ -160,7 +161,7 @@ public class WarcProtocolReconstructor {


-    static private String getHeadersAsString(Header[] headers, long responseSize) {
+    static private String getHeadersAsString(Header[] headers, Duration responseDuration, long responseSize) {
         StringJoiner joiner = new StringJoiner("\r\n");

         for (var header : headers) {

@@ -176,6 +177,7 @@ public class WarcProtocolReconstructor {
             if (headerCapitalized.equals("Content-Encoding"))
                 continue;

+
             // Since we're transparently decoding gzip, we need to update the Content-Length header
             // to reflect the actual size of the response body. We'll do this at the end.
             if (headerCapitalized.equals("Content-Length"))
@@ -184,6 +186,7 @@ public class WarcProtocolReconstructor {
             joiner.add(headerCapitalized + ": " + header.getValue());
         }

+        joiner.add("X-Marginalia-Response-Time: " + responseDuration.toMillis());
         joiner.add("Content-Length: " + responseSize);

         return joiner.toString();
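
The synthetic X-Marginalia-Response-Time header is thus appended to the reconstructed header block that gets written into the WARC record, just before the recomputed Content-Length. A minimal, self-contained sketch of what the joiner produces; the Content-Type line is only an illustrative pass-through header:

import java.time.Duration;
import java.util.StringJoiner;

class ResponseHeaderSketch {
    static String headersFor(Duration responseDuration, long responseSize) {
        StringJoiner joiner = new StringJoiner("\r\n");
        joiner.add("Content-Type: text/html");                                    // example pass-through header
        joiner.add("X-Marginalia-Response-Time: " + responseDuration.toMillis()); // added by this change
        joiner.add("Content-Length: " + responseSize);                            // rewritten to the decoded body size
        return joiner.toString();
    }

    public static void main(String[] args) {
        System.out.println(headersFor(Duration.ofMillis(137), 4096));
    }
}
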
@@ -93,7 +93,7 @@ public class WarcRecorder implements AutoCloseable {
         WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
         WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();

-        Instant date = Instant.now();
+        Instant requestDate = Instant.now();

         // Not entirely sure why we need to do this, but keeping it due to Chesterton's Fence
         Map<String, List<String>> extraHeaders = new HashMap<>(request.getHeaders().length);

@@ -108,6 +108,8 @@ public class WarcRecorder implements AutoCloseable {
         try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request, timeout);
              InputStream inputStream = inputBuffer.read()) {

+            Instant responseDate = Instant.now();
+
             cookies.updateCookieStore(response);

             // Build and write the request
@@ -126,7 +128,7 @@ public class WarcRecorder implements AutoCloseable {

             WarcRequest warcRequest = new WarcRequest.Builder(requestUri)
                     .blockDigest(requestDigestBuilder.build())
-                    .date(date)
+                    .date(requestDate)
                     .body(MediaType.HTTP_REQUEST, httpRequestString)
                     .build();

@@ -138,7 +140,9 @@ public class WarcRecorder implements AutoCloseable {
                 response.addHeader("X-Has-Cookies", 1);
             }

-            byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
+            byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response,
+                    Duration.between(requestDate, responseDate),
+                    inputBuffer.size()).getBytes(StandardCharsets.UTF_8);

             ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);

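
The duration passed here is simply the gap between the two timestamps introduced above: requestDate is taken before the request is issued, responseDate once the response buffer has been obtained. A minimal sketch of that measurement, with a sleep standing in for the real HTTP exchange (the 120 ms value is arbitrary):

import java.time.Duration;
import java.time.Instant;

public class FetchTimingSketch {
    public static void main(String[] args) throws InterruptedException {
        Instant requestDate = Instant.now();   // before the request goes out
        Thread.sleep(120);                     // stand-in for the request/response round trip
        Instant responseDate = Instant.now();  // once the response buffer is available
        // This is the value that ends up in X-Marginalia-Response-Time
        System.out.println(Duration.between(requestDate, responseDate).toMillis() + " ms");
    }
}
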
@@ -169,7 +173,7 @@ public class WarcRecorder implements AutoCloseable {

             WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
                     .blockDigest(responseDigestBuilder.build())
-                    .date(date)
+                    .date(responseDate)
                     .concurrentTo(warcRequest.id())
                     .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());

@@ -184,7 +188,7 @@ public class WarcRecorder implements AutoCloseable {
             warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
             writer.write(warcResponse);

-            if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
+            if (Duration.between(requestDate, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
                     && inputBuffer.size() < 2048
                     && !requestUri.getPath().endsWith("robots.txt")) // don't bail on robots.txt
             {

@@ -196,7 +200,7 @@ public class WarcRecorder implements AutoCloseable {

                 logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)",
                         requestUri,
-                        Duration.between(date, Instant.now()).getSeconds(),
+                        Duration.between(requestDate, Instant.now()).getSeconds(),
                         inputBuffer.size()
                 );

@@ -148,6 +148,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
                 nextRecord.body,
                 // this field isn't actually used, maybe we can skip calculating it?
                 nextRecord.cookies,
+                -1,
                 lastModified,
                 etag));
     }
@@ -166,6 +166,7 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
                 nextRecord.body(),
                 // this field isn't actually used, maybe we can skip calculating it?
                 nextRecord.cookies(),
+                nextRecord.requestTimeMs(),
                 null,
                 null));
     }
@@ -23,6 +23,7 @@ public final class CrawledDocument implements SerializableCrawlData {

     public String crawlerStatus;
     public String crawlerStatusDesc;
+    public int requestTimeMs;

     @Nullable
     public String headers;

@@ -82,7 +83,7 @@ public final class CrawledDocument implements SerializableCrawlData {
     public String lastModifiedMaybe;
     public String etagMaybe;

-    public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, byte[] documentBodyBytes, Boolean hasCookies, String lastModifiedMaybe, String etagMaybe) {
+    public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, byte[] documentBodyBytes, Boolean hasCookies, int requestTimeMs, String lastModifiedMaybe, String etagMaybe) {
         this.crawlId = crawlId;
         this.url = url;
         this.contentType = contentType;

@@ -94,6 +95,7 @@ public final class CrawledDocument implements SerializableCrawlData {
         this.documentBodyBytes = Objects.requireNonNullElse(documentBodyBytes, new byte[] {});
         this.hasCookies = hasCookies;
         this.lastModifiedMaybe = lastModifiedMaybe;
+        this.requestTimeMs = requestTimeMs;
         this.etagMaybe = etagMaybe;
     }

@@ -173,6 +175,7 @@ public final class CrawledDocument implements SerializableCrawlData {
        private byte[] documentBodyBytes = new byte[0];
        private String recrawlState;
        private Boolean hasCookies;
+       private int requestTimeMs;
        private String lastModifiedMaybe;
        private String etagMaybe;

@@ -248,8 +251,13 @@ public final class CrawledDocument implements SerializableCrawlData {
            return this;
        }

+       public CrawledDocumentBuilder requestTimeMs(int requestTimeMs) {
+           this.requestTimeMs = requestTimeMs;
+           return this;
+       }
+
        public CrawledDocument build() {
-           return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBodyBytes, this.hasCookies, this.lastModifiedMaybe, this.etagMaybe);
+           return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBodyBytes, this.hasCookies, this.requestTimeMs, this.lastModifiedMaybe, this.etagMaybe);
        }

        public String toString() {
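
For existing call sites, the practical effect is one extra int argument after hasCookies: the measured request time in milliseconds, or -1 where it is not known (which is what the sideloader, the parquet import path, and the tests in this diff pass). A hedged sketch of a call against the new signature; the argument values are illustrative and body is assumed to be a byte[] already in scope:

var doc = new CrawledDocument(
        "crawl-1",                   // crawlId
        "https://www.example.com/",  // url
        "text/html",                 // contentType
        Instant.now().toString(),    // timestamp
        200, "OK", "OK",             // httpStatus, crawlerStatus, crawlerStatusDesc
        "",                          // headers
        body,                        // documentBodyBytes
        false,                       // hasCookies
        245,                         // requestTimeMs (new): -1 when not measured
        null, null);                 // lastModifiedMaybe, etagMaybe
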
@@ -9,6 +9,7 @@ import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecord;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
 import nu.marginalia.slop.column.array.ByteArrayColumn;
 import nu.marginalia.slop.column.primitive.ByteColumn;
+import nu.marginalia.slop.column.primitive.IntColumn;
 import nu.marginalia.slop.column.primitive.LongColumn;
 import nu.marginalia.slop.column.primitive.ShortColumn;
 import nu.marginalia.slop.column.string.EnumColumn;

@@ -39,6 +40,7 @@ public record SlopCrawlDataRecord(String domain,
                                   long timestamp,
                                   String contentType,
                                   byte[] body,
+                                  int requestTimeMs,
                                   String headers)
 {
     private static final EnumColumn domainColumn = new EnumColumn("domain", StandardCharsets.UTF_8, StorageType.ZSTD);
@@ -49,6 +51,7 @@ public record SlopCrawlDataRecord(String domain,
     private static final LongColumn timestampColumn = new LongColumn("timestamp");
     private static final EnumColumn contentTypeColumn = new EnumColumn("contentType", StandardCharsets.UTF_8);
     private static final ByteArrayColumn bodyColumn = new ByteArrayColumn("body", StorageType.ZSTD);
+    private static final ShortColumn requestTimeColumn = new ShortColumn("requestTimeMs");
     private static final StringColumn headerColumn = new StringColumn("header", StandardCharsets.UTF_8, StorageType.ZSTD);

     public SlopCrawlDataRecord(CrawledDocumentParquetRecord parquetRecord) {

@@ -60,6 +63,7 @@ public record SlopCrawlDataRecord(String domain,
                 parquetRecord.timestamp.toEpochMilli(),
                 parquetRecord.contentType,
                 parquetRecord.body,
+                -1,
                 parquetRecord.headers
         );
     }
@@ -74,6 +78,7 @@ public record SlopCrawlDataRecord(String domain,
                 date.toEpochMilli(),
                 "x-marginalia/advisory;state=redirect",
                 new byte[0],
+                -1,
                 ""
         );
     }

@@ -87,6 +92,7 @@ public record SlopCrawlDataRecord(String domain,
                 date.toEpochMilli(),
                 "x-marginalia/advisory;state=error",
                 errorStatus.getBytes(),
+                -1,
                 ""
         );
     }

@@ -100,6 +106,7 @@ public record SlopCrawlDataRecord(String domain,
                 date.toEpochMilli(),
                 errorStatus,
                 new byte[0],
+                -1,
                 ""
         );
     }
@@ -321,6 +328,7 @@ public record SlopCrawlDataRecord(String domain,
        private final LongColumn.Writer timestampColumnWriter;
        private final EnumColumn.Writer contentTypeColumnWriter;
        private final ByteArrayColumn.Writer bodyColumnWriter;
+       private final ShortColumn.Writer requestTimeColumnWriter;
        private final StringColumn.Writer headerColumnWriter;

        public Writer(Path path) throws IOException {

@@ -334,6 +342,7 @@ public record SlopCrawlDataRecord(String domain,
            timestampColumnWriter = timestampColumn.create(this);
            contentTypeColumnWriter = contentTypeColumn.create(this);
            bodyColumnWriter = bodyColumn.create(this);
+           requestTimeColumnWriter = requestTimeColumn.create(this);
            headerColumnWriter = headerColumn.create(this);
        }

@@ -346,6 +355,7 @@ public record SlopCrawlDataRecord(String domain,
            timestampColumnWriter.put(record.timestamp);
            contentTypeColumnWriter.put(record.contentType);
            bodyColumnWriter.put(record.body);
+           requestTimeColumnWriter.put((short) record.requestTimeMs);
            headerColumnWriter.put(record.headers);
        }

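
Because the column is a ShortColumn, the millisecond value is narrowed with a plain Java (short) cast on write, so anything above Short.MAX_VALUE (32767 ms) wraps around while the -1 sentinel survives unchanged. A self-contained illustration of that narrowing with arbitrary sample values:

public class RequestTimeNarrowing {
    public static void main(String[] args) {
        System.out.println((short) 1_500);   // 1500   -> a 1.5 s fetch fits
        System.out.println((short) 40_000);  // -25536 -> a 40 s fetch wraps around
        System.out.println((short) -1);      // -1     -> the "unknown" sentinel is preserved
    }
}
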
@@ -391,10 +401,20 @@ public record SlopCrawlDataRecord(String domain,

        String headersStr;
        StringJoiner headersStrBuilder = new StringJoiner("\n");
+       int requestTimeMs = -1;
        for (var header : headers) {
            if (header.getName().equalsIgnoreCase("X-Cookies") && "1".equals(header.getValue())) {
                hasCookies = true;
            }
+           if (header.getName().equals("X-Marginalia-Response-Time")) {
+               try {
+                   requestTimeMs = Integer.parseInt(header.getValue());
+               }
+               catch (NumberFormatException ex) {
+                   logger.warn("Failed to parse X-Marginalia-Response-Time header: {}", header.getValue());
+               }
+               continue;
+           }
            headersStrBuilder.add(header.getName() + ": " + header.getValue());
        }
        headersStr = headersStrBuilder.toString();
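
This is the consumer side of the header written by WarcProtocolReconstructor: the synthetic header is parsed into requestTimeMs (falling back to the -1 sentinel on a malformed value) and deliberately not copied into the stored header string, hence the continue. A small self-contained sketch of the round trip using the same literal header format:

import java.time.Duration;

public class ResponseTimeRoundTrip {
    // Producer side: the synthetic header appended by WarcProtocolReconstructor.
    static String encode(Duration responseDuration) {
        return "X-Marginalia-Response-Time: " + responseDuration.toMillis();
    }

    // Consumer side: the parse used above, falling back to -1 when unparseable.
    static int decode(String headerValue) {
        try {
            return Integer.parseInt(headerValue);
        } catch (NumberFormatException ex) {
            return -1;
        }
    }

    public static void main(String[] args) {
        String header = encode(Duration.ofMillis(245));
        String value = header.substring(header.indexOf(": ") + 2);
        System.out.println(decode(value)); // 245
    }
}
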
@@ -409,6 +429,7 @@ public record SlopCrawlDataRecord(String domain,
                        response.date().toEpochMilli(),
                        contentType,
                        bodyBytes,
+                       requestTimeMs,
                        headersStr
                )
        );
@@ -461,6 +482,7 @@ public record SlopCrawlDataRecord(String domain,
        private final LongColumn.Reader timestampColumnReader;
        private final EnumColumn.Reader contentTypeColumnReader;
        private final ByteArrayColumn.Reader bodyColumnReader;
+       private final ShortColumn.Reader requestTimeColumnReader;
        private final StringColumn.Reader headerColumnReader;

        public Reader(Path path) throws IOException {

@@ -474,6 +496,7 @@ public record SlopCrawlDataRecord(String domain,
            timestampColumnReader = timestampColumn.open(this);
            contentTypeColumnReader = contentTypeColumn.open(this);
            bodyColumnReader = bodyColumn.open(this);
+           requestTimeColumnReader = requestTimeColumn.open(this);
            headerColumnReader = headerColumn.open(this);
        }

@@ -487,6 +510,7 @@ public record SlopCrawlDataRecord(String domain,
                    timestampColumnReader.get(),
                    contentTypeColumnReader.get(),
                    bodyColumnReader.get(),
+                   requestTimeColumnReader.get(),
                    headerColumnReader.get()
            );
        }
@@ -506,6 +530,7 @@ public record SlopCrawlDataRecord(String domain,
        private final LongColumn.Reader timestampColumnReader;
        private final EnumColumn.Reader contentTypeColumnReader;
        private final ByteArrayColumn.Reader bodyColumnReader;
+       private final ShortColumn.Reader requestTimeColumnReader;
        private final StringColumn.Reader headerColumnReader;

        private SlopCrawlDataRecord next = null;

@@ -521,6 +546,7 @@ public record SlopCrawlDataRecord(String domain,
            timestampColumnReader = timestampColumn.open(this);
            contentTypeColumnReader = contentTypeColumn.open(this);
            bodyColumnReader = bodyColumn.open(this);
+           requestTimeColumnReader = requestTimeColumn.open(this);
            headerColumnReader = headerColumn.open(this);
        }

@@ -548,6 +574,7 @@ public record SlopCrawlDataRecord(String domain,
            boolean cookies = cookiesColumnReader.get() == 1;
            int status = statusColumnReader.get();
            long timestamp = timestampColumnReader.get();
+           int requestTimeMs = requestTimeColumnReader.get();
            String contentType = contentTypeColumnReader.get();

            LargeItem<byte[]> body = bodyColumnReader.getLarge();

@@ -555,7 +582,7 @@ public record SlopCrawlDataRecord(String domain,

            if (filter(url, status, contentType)) {
                next = new SlopCrawlDataRecord(
-                       domain, url, ip, cookies, status, timestamp, contentType, body.get(), headers.get()
+                       domain, url, ip, cookies, status, timestamp, contentType, body.get(), requestTimeMs, headers.get()
                );
                return true;
            }
@@ -195,6 +195,7 @@ public class LiveCrawlDataSet implements AutoCloseable {
                headers,
                body,
                false,
+               -1,
                "",
                ""
        ));
@@ -61,7 +61,7 @@ public class UrlDeduplicator {

    private boolean limitResultsPerDomain(DecoratedSearchResultItem details) {
        final var domain = details.getUrl().getDomain();
-       final String key = domain.getDomainKey();
+       final String key = domain.toString();

        return keyCount.adjustOrPutValue(key, 1, 1) <= resultsPerKey;
    }
@@ -8,3 +8,4 @@
 2025-05-05: Deploy executor partition 4.
 2025-05-05: Deploy control.
 2025-05-08: Deploy assistant.
+2025-05-17: Redeploy all.