mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits


13 Commits

Author SHA1 Message Date
Viktor Lofgren
cc40e99fdc (crawler) Add a migration workaround so we can still open old slop crawl data with the new column added 2025-05-19 14:37:59 +02:00
Viktor Lofgren
8a944cf4c6 (crawler) Add request time to crawl data
This is an interesting indicator of website quality.
2025-05-19 14:07:41 +02:00
Viktor Lofgren
1c128e6d82 (crawler) Add request time to crawl data
This is an interesting indicator of website quality.
2025-05-19 14:02:03 +02:00
Viktor Lofgren
4edc0d3267 (converter) Increase work buffer for converter
Conversion on index node  7 in production is crashing ostensibly because this buffer is too small.
2025-05-18 13:22:44 +02:00
Viktor Lofgren
890f521d0d (pdf) Fix crash for some bold lines 2025-05-18 13:05:05 +02:00
Viktor Lofgren
b1814a30f7 (deploy) Redeploy all services. 2025-05-17 13:11:51 +02:00
Viktor Lofgren
f59a9eb025 (legacy-search) Soften domain limit constraints in URL deduplication 2025-05-17 00:04:27 +02:00
Viktor Lofgren
599534806b (search) Soften domain limit constraints in URL deduplication 2025-05-17 00:00:42 +02:00
Viktor Lofgren
7e8253dac7 (search) Clean up debug logging 2025-05-17 00:00:28 +02:00
Viktor Lofgren
97a6780ea3 (search) Add debug logging for specific query 2025-05-16 23:41:35 +02:00
Viktor Lofgren
eb634beec8 (search) Add debug logging for specific query 2025-05-16 23:34:03 +02:00
Viktor Lofgren
269ebd1654 Revert "(query) Add debug logging for specific query"
This reverts commit 39ce40bfeb.
2025-05-16 23:29:06 +02:00
Viktor Lofgren
39ce40bfeb (query) Add debug logging for specific query 2025-05-16 23:23:53 +02:00
17 changed files with 79 additions and 37 deletions
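Most of the changes below thread a per-request timing measurement through the crawl pipeline: the fetch duration is measured in WarcRecorder, written into the WARC record as a synthetic X-Marginalia-Response-Time header by WarcProtocolReconstructor, and parsed back into a requestTimeMs field (with -1 as the "unknown" sentinel) when the data is converted to slop records. A minimal sketch of that flow, using invented names rather than the actual classes in the diffs:

import java.time.Duration;
import java.time.Instant;

// Illustrative sketch only; the class and method names here are hypothetical,
// not the ones used in the diffs below.
class RequestTimingSketch {
    static final String TIMING_HEADER = "X-Marginalia-Response-Time";

    // Measure the elapsed wall-clock time of an HTTP exchange in milliseconds.
    static long measure(Runnable httpExchange) {
        Instant requestDate = Instant.now();
        httpExchange.run();
        Instant responseDate = Instant.now();
        return Duration.between(requestDate, responseDate).toMillis();
    }

    // Serialize the measurement as a synthetic header line for the WARC record.
    static String toHeaderLine(long requestTimeMs) {
        return TIMING_HEADER + ": " + requestTimeMs;
    }

    // Recover the value when re-reading headers; -1 marks records without a
    // usable measurement, matching the sentinel used for legacy data below.
    static int parse(String headerValue) {
        try {
            return Integer.parseInt(headerValue);
        }
        catch (NumberFormatException ex) {
            return -1;
        }
    }
}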


@@ -112,14 +112,6 @@ public class EdgeDomain implements Serializable {
         return topDomain;
     }
-    public String getDomainKey() {
-        int cutPoint = topDomain.indexOf('.');
-        if (cutPoint < 0) {
-            return topDomain;
-        }
-        return topDomain.substring(0, cutPoint).toLowerCase();
-    }
     /** If possible, try to provide an alias domain,
      * i.e. a domain name that is very likely to link to this one
      * */


@@ -8,14 +8,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
 class EdgeDomainTest {
-    @Test
-    public void testSkepdic() throws URISyntaxException {
-        var domain = new EdgeUrl("http://www.skepdic.com/astrology.html");
-        assertEquals("skepdic", domain.getDomain().getDomainKey());
-        var domain2 = new EdgeUrl("http://skepdic.com/astrology.html");
-        assertEquals("skepdic", domain2.getDomain().getDomainKey());
-    }
     @Test
     public void testHkDomain() throws URISyntaxException {
         var domain = new EdgeUrl("http://l7072i3.l7c.net");


@@ -84,7 +84,7 @@ public class ForwardIndexConverter {
         LongArray docFileData = LongArrayFactory.mmapForWritingConfined(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size());
-        ByteBuffer workArea = ByteBuffer.allocate(65536);
+        ByteBuffer workArea = ByteBuffer.allocate(1024*1024*100);
         for (var instance : journal.pages()) {
             try (var slopTable = new SlopTable(instance.baseDir(), instance.page()))
             {


@@ -53,6 +53,7 @@ public class SideloaderProcessing {
                 "",
                 body.getBytes(StandardCharsets.UTF_8),
                 false,
+                -1,
                 null,
                 null
         );


@@ -2002,12 +2002,11 @@ public class HeadingAwarePDFTextStripper extends LegacyPDFStreamEngine
         float minFontWeight = Integer.MAX_VALUE;
         for (var word : line)
         {
-            int i = 0;
             for (var textPosition : word.getTextPositions())
             {
-                if (word.text.charAt(i++) == ' ') {
-                    continue;
-                }
+                // Skip empty text positions as they may have a different font
+                if (word.text.isBlank()) continue;
                 var font = textPosition.getFont();
                 if (font == null) continue;
                 var descriptor = font.getFontDescriptor();


@@ -148,6 +148,7 @@ public class ConvertingIntegrationTest {
                 "",
                 readClassPathFile(p.toString()).getBytes(),
                 false,
+                -1,
                 null,
                 null
         );


@@ -50,7 +50,7 @@ class PdfDocumentProcessorPluginTest {
         ));
     }
     public AbstractDocumentProcessorPlugin.DetailsWithWords testPdfFile(byte[] pdfBytes) throws Exception {
-        var doc = new CrawledDocument("test", "https://www.example.com/sample.pdf", "application/pdf", Instant.now().toString(), 200, "OK", "OK", "", pdfBytes, false, null, null);
+        var doc = new CrawledDocument("test", "https://www.example.com/sample.pdf", "application/pdf", Instant.now().toString(), 200, "OK", "OK", "", pdfBytes, false, -1, null, null);
         return plugin.createDetails(doc, new LinkTexts(), DocumentClass.NORMAL);
     }


@@ -10,6 +10,7 @@ import java.net.http.HttpClient;
 import java.net.http.HttpHeaders;
 import java.net.http.HttpResponse;
 import java.nio.charset.StandardCharsets;
+import java.time.Duration;
 import java.util.*;
 import java.util.stream.Collectors;
@@ -90,8 +91,8 @@ public class WarcProtocolReconstructor {
         return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
     }
-    static String getResponseHeader(ClassicHttpResponse response, long size) {
-        String headerString = getHeadersAsString(response.getHeaders(), size);
+    static String getResponseHeader(ClassicHttpResponse response, Duration responseDuration, long size) {
+        String headerString = getHeadersAsString(response.getHeaders(), responseDuration, size);
         return response.getVersion().format() + " " + response.getCode() + " " + response.getReasonPhrase() + "\r\n" + headerString + "\r\n\r\n";
     }
@@ -160,7 +161,7 @@ public class WarcProtocolReconstructor {
-    static private String getHeadersAsString(Header[] headers, long responseSize) {
+    static private String getHeadersAsString(Header[] headers, Duration responseDuration, long responseSize) {
         StringJoiner joiner = new StringJoiner("\r\n");
         for (var header : headers) {
@@ -176,6 +177,7 @@ public class WarcProtocolReconstructor {
             if (headerCapitalized.equals("Content-Encoding"))
                 continue;
             // Since we're transparently decoding gzip, we need to update the Content-Length header
             // to reflect the actual size of the response body. We'll do this at the end.
             if (headerCapitalized.equals("Content-Length"))
@@ -184,6 +186,7 @@ public class WarcProtocolReconstructor {
             joiner.add(headerCapitalized + ": " + header.getValue());
         }
+        joiner.add("X-Marginalia-Response-Time: " + responseDuration.toMillis());
         joiner.add("Content-Length: " + responseSize);
         return joiner.toString();


@@ -93,7 +93,7 @@ public class WarcRecorder implements AutoCloseable {
         WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
         WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
-        Instant date = Instant.now();
+        Instant requestDate = Instant.now();
         // Not entirely sure why we need to do this, but keeping it due to Chesterton's Fence
         Map<String, List<String>> extraHeaders = new HashMap<>(request.getHeaders().length);
@@ -108,6 +108,8 @@ public class WarcRecorder implements AutoCloseable {
         try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response, request, timeout);
              InputStream inputStream = inputBuffer.read()) {
+            Instant responseDate = Instant.now();
+
             cookies.updateCookieStore(response);
             // Build and write the request
@@ -126,7 +128,7 @@ public class WarcRecorder implements AutoCloseable {
             WarcRequest warcRequest = new WarcRequest.Builder(requestUri)
                     .blockDigest(requestDigestBuilder.build())
-                    .date(date)
+                    .date(requestDate)
                     .body(MediaType.HTTP_REQUEST, httpRequestString)
                     .build();
@@ -138,7 +140,9 @@ public class WarcRecorder implements AutoCloseable {
                 response.addHeader("X-Has-Cookies", 1);
             }
-            byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
+            byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response,
+                    Duration.between(requestDate, responseDate),
+                    inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
             ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);
@@ -169,7 +173,7 @@ public class WarcRecorder implements AutoCloseable {
             WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
                     .blockDigest(responseDigestBuilder.build())
-                    .date(date)
+                    .date(responseDate)
                     .concurrentTo(warcRequest.id())
                     .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
@@ -184,7 +188,7 @@ public class WarcRecorder implements AutoCloseable {
             warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
             writer.write(warcResponse);
-            if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
+            if (Duration.between(requestDate, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
                     && inputBuffer.size() < 2048
                     && !requestUri.getPath().endsWith("robots.txt")) // don't bail on robots.txt
             {
@@ -196,7 +200,7 @@ public class WarcRecorder implements AutoCloseable {
                 logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)",
                         requestUri,
-                        Duration.between(date, Instant.now()).getSeconds(),
+                        Duration.between(requestDate, Instant.now()).getSeconds(),
                         inputBuffer.size()
                 );


@@ -148,6 +148,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
                     nextRecord.body,
                     // this field isn't actually used, maybe we can skip calculating it?
                     nextRecord.cookies,
+                    -1,
                     lastModified,
                     etag));
         }


@@ -166,6 +166,7 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
                     nextRecord.body(),
                     // this field isn't actually used, maybe we can skip calculating it?
                     nextRecord.cookies(),
+                    nextRecord.requestTimeMs(),
                     null,
                     null));
         }


@@ -23,6 +23,7 @@ public final class CrawledDocument implements SerializableCrawlData {
     public String crawlerStatus;
     public String crawlerStatusDesc;
+    public int requestTimeMs;
     @Nullable
     public String headers;
@@ -82,7 +83,7 @@ public final class CrawledDocument implements SerializableCrawlData {
     public String lastModifiedMaybe;
     public String etagMaybe;
-    public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, byte[] documentBodyBytes, Boolean hasCookies, String lastModifiedMaybe, String etagMaybe) {
+    public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, byte[] documentBodyBytes, Boolean hasCookies, int requestTimeMs, String lastModifiedMaybe, String etagMaybe) {
         this.crawlId = crawlId;
         this.url = url;
         this.contentType = contentType;
@@ -94,6 +95,7 @@ public final class CrawledDocument implements SerializableCrawlData {
         this.documentBodyBytes = Objects.requireNonNullElse(documentBodyBytes, new byte[] {});
         this.hasCookies = hasCookies;
         this.lastModifiedMaybe = lastModifiedMaybe;
+        this.requestTimeMs = requestTimeMs;
         this.etagMaybe = etagMaybe;
     }
@@ -173,6 +175,7 @@ public final class CrawledDocument implements SerializableCrawlData {
         private byte[] documentBodyBytes = new byte[0];
         private String recrawlState;
         private Boolean hasCookies;
+        private int requestTimeMs;
         private String lastModifiedMaybe;
         private String etagMaybe;
@@ -248,8 +251,13 @@ public final class CrawledDocument implements SerializableCrawlData {
             return this;
         }
+        public CrawledDocumentBuilder requestTimeMs(int requestTimeMs) {
+            this.requestTimeMs = requestTimeMs;
+            return this;
+        }
+
         public CrawledDocument build() {
-            return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBodyBytes, this.hasCookies, this.lastModifiedMaybe, this.etagMaybe);
+            return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBodyBytes, this.hasCookies, this.requestTimeMs, this.lastModifiedMaybe, this.etagMaybe);
         }
         public String toString() {
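Every construction site of CrawledDocument in this change set gains the extra requestTimeMs argument, with -1 as the sentinel where no measurement exists. For readability, a hypothetical call against the widened constructor, with placeholder values and one argument per line:

// Hypothetical call site; the values are placeholders, only the parameter order
// reflects the constructor shown in the diff above.
byte[] bodyBytes = "<html></html>".getBytes(java.nio.charset.StandardCharsets.UTF_8);
var doc = new CrawledDocument(
        "example-crawl",                    // crawlId
        "https://www.example.com/",         // url
        "text/html",                        // contentType
        java.time.Instant.now().toString(), // timestamp
        200,                                // httpStatus
        "OK",                               // crawlerStatus
        "OK",                               // crawlerStatusDesc
        null,                               // headers (@Nullable)
        bodyBytes,                          // documentBodyBytes
        false,                              // hasCookies
        137,                                // requestTimeMs (new; -1 when unknown)
        null,                               // lastModifiedMaybe
        null);                              // etagMaybe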


@@ -9,6 +9,7 @@ import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecord;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
 import nu.marginalia.slop.column.array.ByteArrayColumn;
 import nu.marginalia.slop.column.primitive.ByteColumn;
+import nu.marginalia.slop.column.primitive.IntColumn;
 import nu.marginalia.slop.column.primitive.LongColumn;
 import nu.marginalia.slop.column.primitive.ShortColumn;
 import nu.marginalia.slop.column.string.EnumColumn;
@@ -39,6 +40,7 @@ public record SlopCrawlDataRecord(String domain,
                                   long timestamp,
                                   String contentType,
                                   byte[] body,
+                                  int requestTimeMs,
                                   String headers)
 {
     private static final EnumColumn domainColumn = new EnumColumn("domain", StandardCharsets.UTF_8, StorageType.ZSTD);
@@ -49,6 +51,7 @@ public record SlopCrawlDataRecord(String domain,
     private static final LongColumn timestampColumn = new LongColumn("timestamp");
     private static final EnumColumn contentTypeColumn = new EnumColumn("contentType", StandardCharsets.UTF_8);
     private static final ByteArrayColumn bodyColumn = new ByteArrayColumn("body", StorageType.ZSTD);
+    private static final ShortColumn requestTimeColumn = new ShortColumn("requestTimeMs");
     private static final StringColumn headerColumn = new StringColumn("header", StandardCharsets.UTF_8, StorageType.ZSTD);
     public SlopCrawlDataRecord(CrawledDocumentParquetRecord parquetRecord) {
@@ -60,6 +63,7 @@ public record SlopCrawlDataRecord(String domain,
                 parquetRecord.timestamp.toEpochMilli(),
                 parquetRecord.contentType,
                 parquetRecord.body,
+                -1,
                 parquetRecord.headers
         );
     }
@@ -74,6 +78,7 @@ public record SlopCrawlDataRecord(String domain,
                 date.toEpochMilli(),
                 "x-marginalia/advisory;state=redirect",
                 new byte[0],
+                -1,
                 ""
         );
     }
@@ -87,6 +92,7 @@ public record SlopCrawlDataRecord(String domain,
                 date.toEpochMilli(),
                 "x-marginalia/advisory;state=error",
                 errorStatus.getBytes(),
+                -1,
                 ""
         );
     }
@@ -100,6 +106,7 @@ public record SlopCrawlDataRecord(String domain,
                 date.toEpochMilli(),
                 errorStatus,
                 new byte[0],
+                -1,
                 ""
         );
     }
@@ -321,6 +328,7 @@ public record SlopCrawlDataRecord(String domain,
        private final LongColumn.Writer timestampColumnWriter;
        private final EnumColumn.Writer contentTypeColumnWriter;
        private final ByteArrayColumn.Writer bodyColumnWriter;
+       private final ShortColumn.Writer requestTimeColumnWriter;
        private final StringColumn.Writer headerColumnWriter;
        public Writer(Path path) throws IOException {
@@ -334,6 +342,7 @@ public record SlopCrawlDataRecord(String domain,
            timestampColumnWriter = timestampColumn.create(this);
            contentTypeColumnWriter = contentTypeColumn.create(this);
            bodyColumnWriter = bodyColumn.create(this);
+           requestTimeColumnWriter = requestTimeColumn.create(this);
            headerColumnWriter = headerColumn.create(this);
        }
@@ -346,6 +355,7 @@ public record SlopCrawlDataRecord(String domain,
            timestampColumnWriter.put(record.timestamp);
            contentTypeColumnWriter.put(record.contentType);
            bodyColumnWriter.put(record.body);
+           requestTimeColumnWriter.put((short) record.requestTimeMs);
            headerColumnWriter.put(record.headers);
        }
@@ -391,10 +401,20 @@ public record SlopCrawlDataRecord(String domain,
            String headersStr;
            StringJoiner headersStrBuilder = new StringJoiner("\n");
+           int requestTimeMs = -1;
            for (var header : headers) {
                if (header.getName().equalsIgnoreCase("X-Cookies") && "1".equals(header.getValue())) {
                    hasCookies = true;
                }
+               if (header.getName().equals("X-Marginalia-Response-Time")) {
+                   try {
+                       requestTimeMs = Integer.parseInt(header.getValue());
+                   }
+                   catch (NumberFormatException ex) {
+                       logger.warn("Failed to parse X-Marginalia-Response-Time header: {}", header.getValue());
+                   }
+                   continue;
+               }
                headersStrBuilder.add(header.getName() + ": " + header.getValue());
            }
            headersStr = headersStrBuilder.toString();
@@ -409,6 +429,7 @@ public record SlopCrawlDataRecord(String domain,
                        response.date().toEpochMilli(),
                        contentType,
                        bodyBytes,
+                       requestTimeMs,
                        headersStr
                )
            );
@@ -461,6 +482,7 @@ public record SlopCrawlDataRecord(String domain,
        private final LongColumn.Reader timestampColumnReader;
        private final EnumColumn.Reader contentTypeColumnReader;
        private final ByteArrayColumn.Reader bodyColumnReader;
+       private final ShortColumn.Reader requestTimeColumnReader;
        private final StringColumn.Reader headerColumnReader;
        public Reader(Path path) throws IOException {
@@ -475,6 +497,17 @@ public record SlopCrawlDataRecord(String domain,
            contentTypeColumnReader = contentTypeColumn.open(this);
            bodyColumnReader = bodyColumn.open(this);
            headerColumnReader = headerColumn.open(this);
+
+           // FIXME: After 2025-06-XX, we can remove this migration workaround
+           ShortColumn.Reader timeColumnReader;
+           try {
+               timeColumnReader = requestTimeColumn.open(this);
+           }
+           catch (Exception ex) {
+               // Migration workaround
+               timeColumnReader = null;
+           }
+           requestTimeColumnReader = timeColumnReader;
        }
        public SlopCrawlDataRecord get() throws IOException {
@@ -487,6 +520,7 @@ public record SlopCrawlDataRecord(String domain,
                    timestampColumnReader.get(),
                    contentTypeColumnReader.get(),
                    bodyColumnReader.get(),
+                   requestTimeColumnReader != null ? requestTimeColumnReader.get() : -1,
                    headerColumnReader.get()
            );
        }
@@ -506,6 +540,7 @@ public record SlopCrawlDataRecord(String domain,
        private final LongColumn.Reader timestampColumnReader;
        private final EnumColumn.Reader contentTypeColumnReader;
        private final ByteArrayColumn.Reader bodyColumnReader;
+       private final ShortColumn.Reader requestTimeColumnReader;
        private final StringColumn.Reader headerColumnReader;
        private SlopCrawlDataRecord next = null;
@@ -521,6 +556,7 @@ public record SlopCrawlDataRecord(String domain,
            timestampColumnReader = timestampColumn.open(this);
            contentTypeColumnReader = contentTypeColumn.open(this);
            bodyColumnReader = bodyColumn.open(this);
+           requestTimeColumnReader = requestTimeColumn.open(this);
            headerColumnReader = headerColumn.open(this);
        }
@@ -548,6 +584,7 @@ public record SlopCrawlDataRecord(String domain,
            boolean cookies = cookiesColumnReader.get() == 1;
            int status = statusColumnReader.get();
            long timestamp = timestampColumnReader.get();
+           int requestTimeMs = requestTimeColumnReader.get();
            String contentType = contentTypeColumnReader.get();
            LargeItem<byte[]> body = bodyColumnReader.getLarge();
@@ -555,7 +592,7 @@ public record SlopCrawlDataRecord(String domain,
            if (filter(url, status, contentType)) {
                next = new SlopCrawlDataRecord(
-                       domain, url, ip, cookies, status, timestamp, contentType, body.get(), headers.get()
+                       domain, url, ip, cookies, status, timestamp, contentType, body.get(), requestTimeMs, headers.get()
                );
                return true;
            }


@@ -195,6 +195,7 @@ public class LiveCrawlDataSet implements AutoCloseable {
                    headers,
                    body,
                    false,
+                   -1,
                    "",
                    ""
            ));


@@ -61,7 +61,7 @@ public class UrlDeduplicator {
     private boolean limitResultsPerDomain(DecoratedSearchResultItem details) {
         final var domain = details.getUrl().getDomain();
-        final String key = domain.getDomainKey();
+        final String key = domain.toString();
         return keyCount.adjustOrPutValue(key, 1, 1) <= resultsPerKey;
     }


@@ -25,6 +25,7 @@ public class UrlDeduplicator {
     }
     public boolean shouldRemove(DecoratedSearchResultItem details) {
         if (!deduplicateOnSuperficialHash(details))
             return true;
         if (!deduplicateOnLSH(details))
@@ -61,7 +62,7 @@ public class UrlDeduplicator {
     private boolean limitResultsPerDomain(DecoratedSearchResultItem details) {
         final var domain = details.getUrl().getDomain();
-        final String key = domain.getDomainKey();
+        final String key = domain.toString();
         return keyCount.adjustOrPutValue(key, 1, 1) <= resultsPerKey;
     }


@@ -7,4 +7,5 @@
 2025-05-04: Deploy qs, search and api-services.
 2025-05-05: Deploy executor partition 4.
 2025-05-05: Deploy control.
-2025-05-08: Deploy assistant.
+2025-05-08: Deploy assistant.
+2025-05-17: Redeploy all.