1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

2 Commits

Author SHA1 Message Date
Viktor Lofgren
cc40e99fdc (crawler) Add a migration workaround so we can still open old slop crawl data with the new column added 2025-05-19 14:37:59 +02:00
Viktor Lofgren
8a944cf4c6 (crawler) Add request time to crawl data
This is an interesting indicator of website quality.
2025-05-19 14:07:41 +02:00

View File

@@ -328,6 +328,7 @@ public record SlopCrawlDataRecord(String domain,
private final LongColumn.Writer timestampColumnWriter;
private final EnumColumn.Writer contentTypeColumnWriter;
private final ByteArrayColumn.Writer bodyColumnWriter;
private final ShortColumn.Writer requestTimeColumnWriter;
private final StringColumn.Writer headerColumnWriter;
public Writer(Path path) throws IOException {
@@ -341,6 +342,7 @@ public record SlopCrawlDataRecord(String domain,
timestampColumnWriter = timestampColumn.create(this);
contentTypeColumnWriter = contentTypeColumn.create(this);
bodyColumnWriter = bodyColumn.create(this);
requestTimeColumnWriter = requestTimeColumn.create(this);
headerColumnWriter = headerColumn.create(this);
}
@@ -353,6 +355,7 @@ public record SlopCrawlDataRecord(String domain,
timestampColumnWriter.put(record.timestamp);
contentTypeColumnWriter.put(record.contentType);
bodyColumnWriter.put(record.body);
requestTimeColumnWriter.put((short) record.requestTimeMs);
headerColumnWriter.put(record.headers);
}
@@ -493,8 +496,18 @@ public record SlopCrawlDataRecord(String domain,
timestampColumnReader = timestampColumn.open(this);
contentTypeColumnReader = contentTypeColumn.open(this);
bodyColumnReader = bodyColumn.open(this);
requestTimeColumnReader = requestTimeColumn.open(this);
headerColumnReader = headerColumn.open(this);
// FIXME: After 2025-06-XX, we can remove this migration workaround
ShortColumn.Reader timeColumnReader;
try {
timeColumnReader = requestTimeColumn.open(this);
}
catch (Exception ex) {
// Migration workaround
timeColumnReader = null;
}
requestTimeColumnReader = timeColumnReader;
}
public SlopCrawlDataRecord get() throws IOException {
@@ -507,7 +520,7 @@ public record SlopCrawlDataRecord(String domain,
timestampColumnReader.get(),
contentTypeColumnReader.get(),
bodyColumnReader.get(),
requestTimeColumnReader.get(), requestTimeColumnReader != null ? requestTimeColumnReader.get() : -1,
headerColumnReader.get()
);
}