Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-10-05 21:22:39 +02:00)

Compare commits: deploy-019 (4 commits)

Commits (author and date columns did not survive extraction):
- 1c128e6d82
- 4edc0d3267
- 890f521d0d
- b1814a30f7
@@ -84,7 +84,7 @@ public class ForwardIndexConverter {

         LongArray docFileData = LongArrayFactory.mmapForWritingConfined(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size());

-        ByteBuffer workArea = ByteBuffer.allocate(65536);
+        ByteBuffer workArea = ByteBuffer.allocate(1024*1024*100);

         for (var instance : journal.pages()) {
             try (var slopTable = new SlopTable(instance.baseDir(), instance.page()))
             {
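The converter's scratch buffer grows from 64 KiB to 100 MiB here. A minimal sketch of why the work-area size matters when streaming data through a reusable buffer; `copyThrough` and the channel types are illustrative assumptions, not Marginalia's API:

```java
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;

class WorkAreaSketch {
    // Streams everything from 'in' to 'out' through a reusable scratch buffer.
    // With a 64 KiB buffer, a 100 MB journal page needs ~1600 refill cycles;
    // with a 100 MB buffer it often needs just one.
    static void copyThrough(ReadableByteChannel in, WritableByteChannel out, ByteBuffer workArea) throws IOException {
        while (in.read(workArea) != -1) {
            workArea.flip();              // switch to draining mode
            while (workArea.hasRemaining()) {
                out.write(workArea);
            }
            workArea.clear();             // switch back to filling mode
        }
    }
}
```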
@@ -53,6 +53,7 @@ public class SideloaderProcessing {
                 "",
                 body.getBytes(StandardCharsets.UTF_8),
                 false,
+                -1,
                 null,
                 null
         );
@@ -2002,12 +2002,11 @@ public class HeadingAwarePDFTextStripper extends LegacyPDFStreamEngine
         float minFontWeight = Integer.MAX_VALUE;
         for (var word : line)
         {
-            int i = 0;
             for (var textPosition : word.getTextPositions())
             {
-                if (word.text.charAt(i++) == ' ') {
-                    continue;
-                }
+                // Skip empty text positions as they may have a different font
+                if (word.text.isBlank()) continue;
+
                 var font = textPosition.getFont();
                 if (font == null) continue;
                 var descriptor = font.getFontDescriptor();
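The replaced check indexed into `word.text` once per text position; if the character sequence and the text-position list ever disagree in length, `charAt(i++)` reads the wrong character or throws. A standalone illustration of that failure mode (not the stripper's actual types):

```java
import java.util.List;

class ParallelIndexPitfall {
    public static void main(String[] args) {
        String text = "ab";
        // Pretend each entry is a TextPosition; here there is one more
        // position than there are characters in the string.
        List<String> positions = List.of("p1", "p2", "p3");

        int i = 0;
        for (String position : positions) {
            // Throws StringIndexOutOfBoundsException on the third position.
            char c = text.charAt(i++);
            System.out.println(position + " -> " + c);
        }
    }
}
```

Checking `word.text.isBlank()` per position avoids having to keep the two sequences in lockstep at all.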
@@ -148,6 +148,7 @@ public class ConvertingIntegrationTest {
                 "",
                 readClassPathFile(p.toString()).getBytes(),
                 false,
+                -1,
                 null,
                 null
         );

@@ -50,7 +50,7 @@ class PdfDocumentProcessorPluginTest {
         ));
     }

     public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(byte[] pdfBytes) throws Exception {
-        var doc = new CrawledDocument("test", "https://www.example.com/sample.pdf", "application/pdf", Instant.now().toString(), 200, "OK", "OK", "", pdfBytes, false, null, null);
+        var doc = new CrawledDocument("test", "https://www.example.com/sample.pdf", "application/pdf", Instant.now().toString(), 200, "OK", "OK", "", pdfBytes, false, -1, null, null);
         return plugin.createDetails(doc, new LinkTexts(), DocumentClass.NORMAL);
     }
@@ -10,6 +10,7 @@ import java.net.http.HttpClient;
 import java.net.http.HttpHeaders;
 import java.net.http.HttpResponse;
 import java.nio.charset.StandardCharsets;
+import java.time.Duration;
 import java.util.*;
 import java.util.stream.Collectors;

@@ -90,8 +91,8 @@ public class WarcProtocolReconstructor {
         return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
     }

-    static String getResponseHeader(ClassicHttpResponse response, long size) {
-        String headerString = getHeadersAsString(response.getHeaders(), size);
+    static String getResponseHeader(ClassicHttpResponse response, Duration responseDuration, long size) {
+        String headerString = getHeadersAsString(response.getHeaders(), responseDuration, size);

         return response.getVersion().format() + " " + response.getCode() + " " + response.getReasonPhrase() + "\r\n" + headerString + "\r\n\r\n";
     }
@@ -160,7 +161,7 @@ public class WarcProtocolReconstructor {


-    static private String getHeadersAsString(Header[] headers, long responseSize) {
+    static private String getHeadersAsString(Header[] headers, Duration responseDuration, long responseSize) {
         StringJoiner joiner = new StringJoiner("\r\n");

         for (var header : headers) {

@@ -176,6 +177,7 @@ public class WarcProtocolReconstructor {
             if (headerCapitalized.equals("Content-Encoding"))
                 continue;

+
             // Since we're transparently decoding gzip, we need to update the Content-Length header
             // to reflect the actual size of the response body. We'll do this at the end.
             if (headerCapitalized.equals("Content-Length"))
@@ -184,6 +186,7 @@ public class WarcProtocolReconstructor {
             joiner.add(headerCapitalized + ": " + header.getValue());
         }

+        joiner.add("X-Marginalia-Response-Time: " + responseDuration.toMillis());
         joiner.add("Content-Length: " + responseSize);

         return joiner.toString();
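The reconstructor now emits a synthetic `X-Marginalia-Response-Time` header (milliseconds) just before the recalculated `Content-Length`. A sketch of the header block this produces, with illustrative values:

```java
import java.time.Duration;
import java.util.StringJoiner;

class SyntheticHeaderSketch {
    public static void main(String[] args) {
        Duration responseDuration = Duration.ofMillis(183); // illustrative
        long responseSize = 5120;                           // illustrative

        StringJoiner joiner = new StringJoiner("\r\n");
        joiner.add("Content-Type: text/html");
        joiner.add("X-Marginalia-Response-Time: " + responseDuration.toMillis());
        joiner.add("Content-Length: " + responseSize);

        // Prints:
        //   Content-Type: text/html
        //   X-Marginalia-Response-Time: 183
        //   Content-Length: 5120
        System.out.println(joiner);
    }
}
```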
@@ -93,7 +93,7 @@ public class WarcRecorder implements AutoCloseable {
         WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
         WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();

-        Instant date = Instant.now();
+        Instant requestDate = Instant.now();

         // Not entirely sure why we need to do this, but keeping it due to Chesterton's Fence
         Map<String, List<String>> extraHeaders = new HashMap<>(request.getHeaders().length);

@@ -108,6 +108,8 @@ public class WarcRecorder implements AutoCloseable {
         try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request, timeout);
              InputStream inputStream = inputBuffer.read()) {

+            Instant responseDate = Instant.now();
+
             cookies.updateCookieStore(response);

             // Build and write the request

@@ -126,7 +128,7 @@ public class WarcRecorder implements AutoCloseable {

             WarcRequest warcRequest = new WarcRequest.Builder(requestUri)
                     .blockDigest(requestDigestBuilder.build())
-                    .date(date)
+                    .date(requestDate)
                     .body(MediaType.HTTP_REQUEST, httpRequestString)
                     .build();

@@ -138,7 +140,9 @@ public class WarcRecorder implements AutoCloseable {
                 response.addHeader("X-Has-Cookies", 1);
             }

-            byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
+            byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response,
+                    Duration.between(requestDate, responseDate),
+                    inputBuffer.size()).getBytes(StandardCharsets.UTF_8);

             ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);
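Taken together, the recorder now stamps one `Instant` before the fetch and another once the response buffer is ready, and hands the delta to the protocol reconstructor. A minimal sketch of the round-trip using only `java.time` (the actual HTTP fetch is elided):

```java
import java.time.Duration;
import java.time.Instant;

class ResponseTimeRoundTrip {
    public static void main(String[] args) throws InterruptedException {
        Instant requestDate = Instant.now();
        Thread.sleep(50); // stand-in for the actual HTTP fetch
        Instant responseDate = Instant.now();

        // Written into the WARC record as a synthetic header...
        String header = "X-Marginalia-Response-Time: "
                + Duration.between(requestDate, responseDate).toMillis();

        // ...and parsed back out when the WARC is converted to Slop.
        int requestTimeMs = Integer.parseInt(header.substring(header.indexOf(':') + 1).trim());
        System.out.println(requestTimeMs + " ms");
    }
}
```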
@@ -169,7 +173,7 @@ public class WarcRecorder implements AutoCloseable {

             WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
                     .blockDigest(responseDigestBuilder.build())
-                    .date(date)
+                    .date(responseDate)
                     .concurrentTo(warcRequest.id())
                     .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());

@@ -184,7 +188,7 @@ public class WarcRecorder implements AutoCloseable {
             warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
             writer.write(warcResponse);

-            if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
+            if (Duration.between(requestDate, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
                     && inputBuffer.size() < 2048
                     && !requestUri.getPath().endsWith("robots.txt")) // don't bail on robots.txt
             {

@@ -196,7 +200,7 @@ public class WarcRecorder implements AutoCloseable {

                 logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)",
                         requestUri,
-                        Duration.between(date, Instant.now()).getSeconds(),
+                        Duration.between(requestDate, Instant.now()).getSeconds(),
                         inputBuffer.size()
                 );
@@ -148,6 +148,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
                     nextRecord.body,
                     // this field isn't actually used, maybe we can skip calculating it?
                     nextRecord.cookies,
+                    -1,
                     lastModified,
                     etag));
         }

@@ -166,6 +166,7 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
                     nextRecord.body(),
                     // this field isn't actually used, maybe we can skip calculating it?
                     nextRecord.cookies(),
+                    nextRecord.requestTimeMs(),
                     null,
                     null));
         }
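The two stream adapters diverge deliberately: the legacy Parquet format has no timing column, so its adapter pins the new argument to -1 as an "unknown" sentinel, while the Slop adapter forwards the recorded value. A tiny sketch of that convention; `hasTimingColumn` is a hypothetical stand-in for the format distinction, not a real field:

```java
class RequestTimeSentinel {
    static final int UNKNOWN = -1;

    // Legacy data (no timing column) reports UNKNOWN; newer data passes through.
    static int requestTimeOf(boolean hasTimingColumn, int recordedMs) {
        return hasTimingColumn ? recordedMs : UNKNOWN;
    }
}
```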
@@ -23,6 +23,7 @@ public final class CrawledDocument implements SerializableCrawlData {

     public String crawlerStatus;
     public String crawlerStatusDesc;
+    public int requestTimeMs;

     @Nullable
     public String headers;

@@ -82,7 +83,7 @@ public final class CrawledDocument implements SerializableCrawlData {
     public String lastModifiedMaybe;
     public String etagMaybe;

-    public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, byte[] documentBodyBytes, Boolean hasCookies, String lastModifiedMaybe, String etagMaybe) {
+    public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, byte[] documentBodyBytes, Boolean hasCookies, int requestTimeMs, String lastModifiedMaybe, String etagMaybe) {
         this.crawlId = crawlId;
         this.url = url;
         this.contentType = contentType;

@@ -94,6 +95,7 @@ public final class CrawledDocument implements SerializableCrawlData {
         this.documentBodyBytes = Objects.requireNonNullElse(documentBodyBytes, new byte[] {});
         this.hasCookies = hasCookies;
         this.lastModifiedMaybe = lastModifiedMaybe;
+        this.requestTimeMs = requestTimeMs;
         this.etagMaybe = etagMaybe;
     }

@@ -173,6 +175,7 @@ public final class CrawledDocument implements SerializableCrawlData {
         private byte[] documentBodyBytes = new byte[0];
         private String recrawlState;
         private Boolean hasCookies;
+        private int requestTimeMs;
         private String lastModifiedMaybe;
         private String etagMaybe;
@@ -248,8 +251,13 @@ public final class CrawledDocument implements SerializableCrawlData {
             return this;
         }

+        public CrawledDocumentBuilder requestTimeMs(int requestTimeMs) {
+            this.requestTimeMs = requestTimeMs;
+            return this;
+        }
+
         public CrawledDocument build() {
-            return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBodyBytes, this.hasCookies, this.lastModifiedMaybe, this.etagMaybe);
+            return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBodyBytes, this.hasCookies, this.requestTimeMs, this.lastModifiedMaybe, this.etagMaybe);
         }

         public String toString() {
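With the new setter in place, call sites can opt in to the timing field without touching the 13-argument constructor. A hypothetical usage sketch; only `requestTimeMs(...)` and the `CrawledDocumentBuilder` class are confirmed by this diff, the `builder()` factory and other setter names are assumed to mirror the field names:

```java
// Hypothetical usage; -1 remains the default "not recorded" value.
CrawledDocument doc = CrawledDocument.builder()
        .crawlId("test")
        .url("https://www.example.com/")
        .httpStatus(200)
        .requestTimeMs(183)   // new: measured request duration in milliseconds
        .build();
```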
@@ -9,6 +9,7 @@ import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecord;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
 import nu.marginalia.slop.column.array.ByteArrayColumn;
 import nu.marginalia.slop.column.primitive.ByteColumn;
 import nu.marginalia.slop.column.primitive.IntColumn;
 import nu.marginalia.slop.column.primitive.LongColumn;
+import nu.marginalia.slop.column.primitive.ShortColumn;
 import nu.marginalia.slop.column.string.EnumColumn;

@@ -39,6 +40,7 @@ public record SlopCrawlDataRecord(String domain,
                                   long timestamp,
                                   String contentType,
                                   byte[] body,
+                                  int requestTimeMs,
                                   String headers)
 {
     private static final EnumColumn domainColumn = new EnumColumn("domain", StandardCharsets.UTF_8, StorageType.ZSTD);

@@ -49,6 +51,7 @@ public record SlopCrawlDataRecord(String domain,
     private static final LongColumn timestampColumn = new LongColumn("timestamp");
     private static final EnumColumn contentTypeColumn = new EnumColumn("contentType", StandardCharsets.UTF_8);
     private static final ByteArrayColumn bodyColumn = new ByteArrayColumn("body", StorageType.ZSTD);
+    private static final ShortColumn requestTimeColumn = new ShortColumn("requestTimeMs");
     private static final StringColumn headerColumn = new StringColumn("header", StandardCharsets.UTF_8, StorageType.ZSTD);

     public SlopCrawlDataRecord(CrawledDocumentParquetRecord parquetRecord) {
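One detail worth flagging: the record component is an `int`, but it is persisted through a `ShortColumn`, so millisecond values outside the signed 16-bit range (roughly 32.7 seconds) cannot be represented exactly. Whether the Slop `ShortColumn` clamps, wraps, or rejects such values is not visible in this diff; the sketch below only shows what plain Java narrowing would do:

```java
class ShortRangeSketch {
    public static void main(String[] args) {
        int requestTimeMs = 60_000;             // a 60 s fetch
        short narrowed = (short) requestTimeMs; // plain Java narrowing wraps:
        System.out.println(narrowed);           // prints -5536, not 60000
    }
}
```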
@@ -60,6 +63,7 @@ public record SlopCrawlDataRecord(String domain,
                 parquetRecord.timestamp.toEpochMilli(),
                 parquetRecord.contentType,
                 parquetRecord.body,
+                -1,
                 parquetRecord.headers
         );
     }

@@ -74,6 +78,7 @@ public record SlopCrawlDataRecord(String domain,
                 date.toEpochMilli(),
                 "x-marginalia/advisory;state=redirect",
                 new byte[0],
+                -1,
                 ""
         );
     }

@@ -87,6 +92,7 @@ public record SlopCrawlDataRecord(String domain,
                 date.toEpochMilli(),
                 "x-marginalia/advisory;state=error",
                 errorStatus.getBytes(),
+                -1,
                 ""
         );
     }

@@ -100,6 +106,7 @@ public record SlopCrawlDataRecord(String domain,
                 date.toEpochMilli(),
                 errorStatus,
                 new byte[0],
+                -1,
                 ""
         );
     }
@@ -391,10 +398,20 @@ public record SlopCrawlDataRecord(String domain,

         String headersStr;
         StringJoiner headersStrBuilder = new StringJoiner("\n");
+        int requestTimeMs = -1;
         for (var header : headers) {
             if (header.getName().equalsIgnoreCase("X-Cookies") && "1".equals(header.getValue())) {
                 hasCookies = true;
             }
+            if (header.getName().equals("X-Marginalia-Response-Time")) {
+                try {
+                    requestTimeMs = Integer.parseInt(header.getValue());
+                }
+                catch (NumberFormatException ex) {
+                    logger.warn("Failed to parse X-Marginalia-Response-Time header: {}", header.getValue());
+                }
+                continue;
+            }
             headersStrBuilder.add(header.getName() + ": " + header.getValue());
         }
         headersStr = headersStrBuilder.toString();
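The parse is best-effort: a malformed value logs a warning and leaves the -1 sentinel in place, and the `continue` keeps `X-Marginalia-Response-Time` out of the stored header string, since it is crawler metadata rather than an origin header. A compact, self-contained sketch of the same defensive pattern:

```java
class HeaderParseSketch {
    static int parseResponseTime(String value) {
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException ex) {
            return -1; // "unknown", the same sentinel used across this changeset
        }
    }

    public static void main(String[] args) {
        System.out.println(parseResponseTime("183"));     // 183
        System.out.println(parseResponseTime("garbage")); // -1
    }
}
```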
@@ -409,6 +426,7 @@ public record SlopCrawlDataRecord(String domain,
                         response.date().toEpochMilli(),
                         contentType,
                         bodyBytes,
+                        requestTimeMs,
                         headersStr
                 )
         );

@@ -461,6 +479,7 @@ public record SlopCrawlDataRecord(String domain,
         private final LongColumn.Reader timestampColumnReader;
         private final EnumColumn.Reader contentTypeColumnReader;
         private final ByteArrayColumn.Reader bodyColumnReader;
+        private final ShortColumn.Reader requestTimeColumnReader;
         private final StringColumn.Reader headerColumnReader;

         public Reader(Path path) throws IOException {

@@ -474,6 +493,7 @@ public record SlopCrawlDataRecord(String domain,
             timestampColumnReader = timestampColumn.open(this);
             contentTypeColumnReader = contentTypeColumn.open(this);
             bodyColumnReader = bodyColumn.open(this);
+            requestTimeColumnReader = requestTimeColumn.open(this);
             headerColumnReader = headerColumn.open(this);
         }

@@ -487,6 +507,7 @@ public record SlopCrawlDataRecord(String domain,
                     timestampColumnReader.get(),
                     contentTypeColumnReader.get(),
                     bodyColumnReader.get(),
+                    requestTimeColumnReader.get(),
                     headerColumnReader.get()
             );
         }

@@ -506,6 +527,7 @@ public record SlopCrawlDataRecord(String domain,
         private final LongColumn.Reader timestampColumnReader;
         private final EnumColumn.Reader contentTypeColumnReader;
         private final ByteArrayColumn.Reader bodyColumnReader;
+        private final ShortColumn.Reader requestTimeColumnReader;
         private final StringColumn.Reader headerColumnReader;

         private SlopCrawlDataRecord next = null;

@@ -521,6 +543,7 @@ public record SlopCrawlDataRecord(String domain,
             timestampColumnReader = timestampColumn.open(this);
             contentTypeColumnReader = contentTypeColumn.open(this);
             bodyColumnReader = bodyColumn.open(this);
+            requestTimeColumnReader = requestTimeColumn.open(this);
             headerColumnReader = headerColumn.open(this);
         }

@@ -548,6 +571,7 @@ public record SlopCrawlDataRecord(String domain,
             boolean cookies = cookiesColumnReader.get() == 1;
             int status = statusColumnReader.get();
             long timestamp = timestampColumnReader.get();
+            int requestTimeMs = requestTimeColumnReader.get();
             String contentType = contentTypeColumnReader.get();

             LargeItem<byte[]> body = bodyColumnReader.getLarge();

@@ -555,7 +579,7 @@ public record SlopCrawlDataRecord(String domain,

             if (filter(url, status, contentType)) {
                 next = new SlopCrawlDataRecord(
-                        domain, url, ip, cookies, status, timestamp, contentType, body.get(), headers.get()
+                        domain, url, ip, cookies, status, timestamp, contentType, body.get(), requestTimeMs, headers.get()
                 );
                 return true;
             }
@@ -195,6 +195,7 @@ public class LiveCrawlDataSet implements AutoCloseable {
                 headers,
                 body,
                 false,
+                -1,
                 "",
                 ""
         ));

@@ -7,4 +7,5 @@
 2025-05-04: Deploy qs, search and api-services.
 2025-05-05: Deploy executor partition 4.
 2025-05-05: Deploy control.
 2025-05-08: Deploy assistant.
+2025-05-17: Redeploy all.