1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 17:32:39 +02:00

Compare commits

...

11 Commits

Author SHA1 Message Date
Viktor Lofgren
fc92e9b9c0 (feeds) Correct link handling in atom feeds
This addresses issue #199
2025-05-09 13:00:07 +02:00
Viktor Lofgren
328fb5d927 (feeds) Correct link handling in atom feeds
This addresses issue #199
2025-05-09 12:55:28 +02:00
Viktor Lofgren
5e2b63473e (logging) Change to a terser log format
The old log format would often span several screen widths, especially when subprocesses logged.  Switching to a terser format that should be much easier to read.
2025-05-08 18:02:22 +02:00
Viktor
f9590703f1 Merge pull request #197 from MarginaliaSearch/crawl-markdown
(markdown) Support crawling markdown
2025-05-08 13:35:00 +02:00
Viktor Lofgren
f12fc11337 (markdown) Support crawling markdown 2025-05-08 13:26:22 +02:00
Viktor Lofgren
c309030184 (sample) Ensure we finalize the slop.zip file creation when filtering 2025-05-06 14:52:48 +02:00
Viktor Lofgren
fd5af01629 (sample) Ensure we flush the log before adding it to the tar file 2025-05-06 14:43:47 +02:00
Viktor Lofgren
d4c43c7a79 (crawler) Test case for fetching PDFs 2025-05-06 13:45:16 +02:00
Viktor Lofgren
18700e1919 (sample) Fix bug where slop files would not be saved despite containing data 2025-05-06 13:38:21 +02:00
Viktor Lofgren
120b431998 (crawler) Fix outdated assumptions about content types and http status codes always being 200 when good.
We now sometimes get 206 when good.
2025-05-06 13:18:30 +02:00
Viktor Lofgren
71dad99326 (crawler) Revisitor should not demand a 200, but support a 206 as well 2025-05-06 13:11:52 +02:00
12 changed files with 272 additions and 49 deletions

View File

@@ -3,11 +3,18 @@
<Console name="Console" target="SYSTEM_OUT">
<PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
<Filters>
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
</Filters>
</Console>
<Console name="ProcessConsole" target="SYSTEM_OUT">
<PatternLayout pattern="%style{P}{FG_Cyan} %msg%n"/>
<Filters>
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
</Filters>
</Console>
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
ignoreExceptions="false">
<JSONLayout compact="true" eventEol="true" properties="true" stacktraceAsString="true" includeTimeMillis="true"/>
@@ -15,6 +22,7 @@
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
</Filters>
<SizeBasedTriggeringPolicy size="10MB" />
</RollingFile>
@@ -34,6 +42,7 @@
<Root level="info">
<AppenderRef ref="Console"/>
<AppenderRef ref="ProcessConsole"/>
<AppenderRef ref="LogToFile"/>
</Root>
</Loggers>

View File

@@ -1,13 +1,51 @@
<Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
<Appenders>
<Console name="Console" target="SYSTEM_OUT">
<PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
<Console name="ConsoleInfo" target="SYSTEM_OUT">
<PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
<Filters>
<LevelMatchFilter level="INFO" onMatch="ALLOW" onMismatch="DENY"/>
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
</Filters>
</Console>
<Console name="ConsoleWarn" target="SYSTEM_OUT">
<PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
<Filters>
<LevelMatchFilter level="WARN" onMatch="ALLOW" onMismatch="DENY"/>
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
</Filters>
</Console>
<Console name="ConsoleError" target="SYSTEM_OUT">
<PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
<Filters>
<LevelMatchFilter level="ERROR" onMatch="ALLOW" onMismatch="DENY"/>
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
</Filters>
</Console>
<Console name="ConsoleFatal" target="SYSTEM_OUT">
<PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
<Filters>
<LevelMatchFilter level="FATAL" onMatch="ALLOW" onMismatch="DENY"/>
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
</Filters>
</Console>
<Console name="ProcessConsole" target="SYSTEM_OUT">
<PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
<Filters>
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
</Filters>
</Console>
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
ignoreExceptions="false">
<PatternLayout>
@@ -36,7 +74,11 @@
<Logger name="org.apache.zookeeper" level="WARN" />
<Root level="info">
<AppenderRef ref="Console"/>
<AppenderRef ref="ConsoleInfo"/>
<AppenderRef ref="ConsoleWarn"/>
<AppenderRef ref="ConsoleError"/>
<AppenderRef ref="ConsoleFatal"/>
<AppenderRef ref="ProcessConsole"/>
<AppenderRef ref="LogToFile"/>
</Root>
</Loggers>

View File

@@ -1,15 +1,49 @@
<Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
<Appenders>
<Console name="Console" target="SYSTEM_OUT">
<PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
<Console name="ConsoleInfo" target="SYSTEM_OUT">
<PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
<Filters>
<LevelMatchFilter level="INFO" onMatch="ALLOW" onMismatch="DENY"/>
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
</Filters>
</Console>
<Console name="ConsoleWarn" target="SYSTEM_OUT">
<PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
<Filters>
<LevelMatchFilter level="WARN" onMatch="ALLOW" onMismatch="DENY"/>
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
</Filters>
</Console>
<Console name="ConsoleError" target="SYSTEM_OUT">
<PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
<Filters>
<LevelMatchFilter level="ERROR" onMatch="ALLOW" onMismatch="DENY"/>
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
</Filters>
</Console>
<Console name="ConsoleFatal" target="SYSTEM_OUT">
<PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
<Filters>
<LevelMatchFilter level="FATAL" onMatch="ALLOW" onMismatch="DENY"/>
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
</Filters>
</Console>
<Console name="ProcessConsole" target="SYSTEM_OUT">
<PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
<Filters>
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
</Filters>
</Console>
</Appenders>
<Loggers>
<Logger name="org.apache.zookeeper" level="WARN" />
<Root level="info">
<AppenderRef ref="Console"/>
<AppenderRef ref="LogToFile"/>
<AppenderRef ref="ConsoleInfo"/>
<AppenderRef ref="ConsoleWarn"/>
<AppenderRef ref="ConsoleError"/>
<AppenderRef ref="ConsoleFatal"/>
<AppenderRef ref="ProcessConsole"/>
</Root>
</Loggers>
</Configuration>

View File

@@ -79,9 +79,17 @@ public class SimpleFeedParser {
if (!link.isBlank())
break;
var tag = element.getElementsByTag(attr).first();
if (tag != null) {
link = tag.text();
String linkText = tag.text();
if (linkText.isBlank()) {
linkText = tag.attr("href");
}
link = linkText;
}
}
ret.add(new ItemData(title, description, link, pubDate));

View File

@@ -67,8 +67,6 @@ dependencies {
testImplementation libs.mockito
testImplementation libs.wiremock
testImplementation project(':code:processes:test-data')
}

View File

@@ -74,7 +74,7 @@ public class CrawlerRevisitor {
// If the reference document is empty or the HTTP status is not 200, we'll skip it since it's
// unlikely to produce anything meaningful for us.
if (doc.httpStatus != 200)
if (doc.httpStatus != 200 && doc.httpStatus != 206)
continue;
if (!doc.hasBody())
continue;

View File

@@ -58,7 +58,7 @@ public record DocumentWithReference(
if (null == doc)
return ContentTags.empty();
if (doc.documentBodyBytes.length == 0 || doc.httpStatus != 200)
if (doc.documentBodyBytes.length == 0 || (doc.httpStatus != 200 && doc.httpStatus != 206))
return ContentTags.empty();
String lastmod = doc.getLastModified();

View File

@@ -1,19 +1,23 @@
package nu.marginalia;
import org.apache.commons.lang3.StringUtils;
import java.util.Set;
public class ContentTypes {
public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
"application/xhtml",
"text/html",
"text/markdown",
"text/x-markdown",
"application/pdf",
"image/x-icon",
"text/plain");
public static boolean isAccepted(String contentTypeHeader) {
String lcHeader = contentTypeHeader.toLowerCase();
String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
for (var type : acceptedContentTypes) {
if (lcHeader.startsWith(type)) {
if (lcHeader.equals(type)) {
return true;
}
}
@@ -21,7 +25,7 @@ public class ContentTypes {
}
public static boolean isBinary(String contentTypeHeader) {
String lcHeader = contentTypeHeader.toLowerCase();
String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
return lcHeader.startsWith("application/pdf");
}

View File

@@ -158,11 +158,12 @@ public record SlopCrawlDataRecord(String domain,
// and is used to store old responses from previous crawls; in this part of the logic
// we treat them the same as a normal response
if (!filterResponse(uaString, response)) {
var filterStatus = filterResponse(uaString, response);
if (filterStatus.isRejected()) {
continue;
}
slopWriter.write(domain, response);
slopWriter.write(domain, filterStatus, response);
} else if (record instanceof WarcXEntityRefused refused) {
slopWriter.write(domain, refused);
} else if (record instanceof Warcinfo warcinfo) {
@@ -187,25 +188,35 @@ public record SlopCrawlDataRecord(String domain,
}
}
sealed interface ResponseFilterResult {
default boolean isRejected() { return false; }
record Accept() implements ResponseFilterResult {}
record AcceptWithContentType(String contentType) implements ResponseFilterResult {}
record AcceptIfPlainText(String contentType) implements ResponseFilterResult {}
record Reject() implements ResponseFilterResult {
@Override
public boolean isRejected() { return true; }
}
}
/** Return true if the WarcResponse should be excluded from conversion */
private static boolean filterResponse(String uaString, WarcResponse response) throws IOException {
private static ResponseFilterResult filterResponse(String uaString, WarcResponse response) throws IOException {
// We don't want to store robots.txt files, as they are not
// interesting for the analysis we want to do. This is important
// since txt-files in general are interesting, and we don't want to
// exclude them as a class.
if (response.targetURI().getPath().equals("/robots.txt")) {
return false;
String uriPath = response.targetURI().getPath();
if (uriPath.equals("/robots.txt")) {
return new ResponseFilterResult.Reject();
}
var headers = response.http().headers();
var robotsTags = headers.all("X-Robots-Tag");
if (!isXRobotsTagsPermitted(robotsTags, uaString)) {
return false;
return new ResponseFilterResult.Reject();
}
// Strip out responses with content types we aren't interested in
@@ -213,15 +224,29 @@ public record SlopCrawlDataRecord(String domain,
String contentType = headers.first("Content-Type").orElse("text/plain").toLowerCase();
if (!ContentTypes.isAccepted(contentType)) {
return false;
String contentTypeWithoutParams = StringUtils.substringBefore(contentType, ";");
// Some servers don't understand what a markdown file is
if (contentTypeWithoutParams.equals("application/octet-stream")) {
if (uriPath.endsWith(".md")) {
// This is a markdown file, which we want to keep
return new ResponseFilterResult.AcceptIfPlainText("text/markdown");
}
else if (uriPath.endsWith(".pdf")) {
// This is a text file, which we want to keep
return new ResponseFilterResult.AcceptWithContentType("application/pdf");
}
}
return new ResponseFilterResult.Reject();
}
// If the format is binary, we don't want to translate it if the response is truncated
if (response.truncated() != WarcTruncationReason.NOT_TRUNCATED && ContentTypes.isBinary(contentType)) {
return false;
return new ResponseFilterResult.Reject();
}
return true;
return new ResponseFilterResult.Accept();
}
/** Check X-Robots-Tag header tag to see if we are allowed to index this page.
@@ -277,7 +302,8 @@ public record SlopCrawlDataRecord(String domain,
try (var table = new SlopTable(path)) {
ShortColumn.Reader statusReader = statusColumn.open(table);
while (statusReader.hasRemaining()) {
if (statusReader.get() == 200) {
int status = statusReader.get();
if (status == 200 || status == 206) {
cnt++;
}
}
@@ -323,7 +349,7 @@ public record SlopCrawlDataRecord(String domain,
headerColumnWriter.put(record.headers);
}
public void write(String domain, WarcResponse response) throws IOException {
public void write(String domain, ResponseFilterResult filterStatus, WarcResponse response) throws IOException {
HttpFetchResult result = HttpFetchResult.importWarc(response);
if (!(result instanceof HttpFetchResult.ResultOk fetchOk)) {
@@ -346,6 +372,21 @@ public record SlopCrawlDataRecord(String domain,
contentType = "";
}
switch (filterStatus) {
case ResponseFilterResult.AcceptWithContentType(String ct) -> contentType = ct;
case ResponseFilterResult.AcceptIfPlainText(String ct) -> {
try {
// Parse the body as UTF-8
new String(bodyBytes, StandardCharsets.UTF_8);
contentType = ct;
}
catch (RuntimeException ex) { // UTF-8 decoding failed
return;
}
}
default -> {}
}
boolean hasCookies = false;
String headersStr;

View File

@@ -117,6 +117,100 @@ class CrawlerRetreiverTest {
}
}
@Test
public void verifyFileFormatSupport() throws IOException {
List<String> urls = List.of(
"https://www.marginalia.nu/junk/test.pdf",
"https://www.marginalia.nu/junk/test.md"
);
var specs = CrawlerMain.CrawlSpecRecord
.builder()
.crawlDepth(5)
.domain("www.marginalia.nu")
.urls(urls)
.build();
Path tempFile = null;
Path slopFile = null;
try {
tempFile = Files.createTempFile("crawling-process", "warc");
slopFile = Files.createTempFile("crawling-process", ".slop.zip");
doCrawl(tempFile, specs);
Set<String> requests = new HashSet<>();
Set<String> responses = new HashSet<>();
// Inspect the WARC file
try (var reader = new WarcReader(tempFile)) {
reader.forEach(record -> {
if (record instanceof WarcRequest req) {
requests.add(req.target());
System.out.println(req.type() + ":" + req.target());
}
else if (record instanceof WarcResponse rsp) {
responses.add(rsp.target());
try {
System.out.println(rsp.type() + ":" + rsp.target() + ":" + rsp.http().contentType());
} catch (IOException e) {
throw new RuntimeException(e);
}
}
else {
System.out.println(record.type());
}
});
}
for (var url : urls) {
assertTrue(requests.contains(url), "Should have requested " + url);
}
assertEquals(requests, responses);
// Convert the WARC file to a Slop file
SlopCrawlDataRecord
.convertWarc("www.marginalia.nu", new UserAgent("test.marginalia.nu", "test.marginalia.nu"), tempFile, slopFile);
CrawledDomain domain = null;
Map<String, CrawledDocument> documents = new HashMap<>();
// Extract the contents of the Slop file
try (var stream = SerializableCrawlDataStream.openDataStream(slopFile)) {
while (stream.hasNext()) {
var doc = stream.next();
if (doc instanceof CrawledDomain dr) {
assertNull(domain);
domain = dr;
}
else if (doc instanceof CrawledDocument dc) {
System.out.println(dc.url + "\t" + dc.crawlerStatus + "\t" + dc.httpStatus);
documents.put(dc.url, dc);
}
}
} catch (Exception e) {
throw new RuntimeException(e);
}
for (var url : urls) {
// Verify we have the downloaded files in the Slop file
assertNotNull(domain);
var fetchedDoc = documents.get(url);
assertNotNull(fetchedDoc, "Should have a document for " + url);
assertEquals(url, fetchedDoc.url);
assertTrue(fetchedDoc.httpStatus == 200 || fetchedDoc.httpStatus == 206, "Should be 200 or 206 for " + url);
assertTrue(fetchedDoc.documentBodyBytes.length > 32, "Should have a body for " + url);
}
} catch (IOException e) {
throw new RuntimeException(e);
} finally {
if (tempFile != null)
Files.deleteIfExists(tempFile);
if (slopFile != null)
Files.deleteIfExists(slopFile);
}
}
@Test
public void testWarcOutputNoKnownUrls() throws IOException {
var specs = CrawlerMain.CrawlSpecRecord

View File

@@ -21,10 +21,7 @@ import java.nio.file.StandardCopyOption;
import java.nio.file.StandardOpenOption;
import java.nio.file.attribute.PosixFilePermissions;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.*;
public class SampleDataExporter {
private final FileStorageService storageService;
@@ -101,6 +98,8 @@ public class SampleDataExporter {
}
}
logWriter.flush();
addFileToTar(stream, newCrawlerLogFile, "crawler.log");
addFileToTar(stream, newManifestJsonFile, "marginalia-manifest.json");
}
@@ -123,23 +122,31 @@ public class SampleDataExporter {
}
Files.createDirectory(tempDir);
boolean wroteEntry = false;
try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
@Override
public boolean filter(String url, int status, String contentType) {
return matchContentTypeHeaderWithMime(contentType, contentTypeFilter)
return Objects.equals(StringUtils.substringBefore(contentType, ';'), contentTypeFilter)
|| contentType.startsWith("x-marginalia/"); // metadata records
}
}
) {
boolean wroteEntry = false;
while (reader.hasRemaining()) {
var entry = reader.get();
writer.write(entry);
wroteEntry = wroteEntry || contentTypeFilter.equals(entry.contentType());
wroteEntry = wroteEntry || Objects.equals(StringUtils.substringBefore(entry.contentType(), ';'), contentTypeFilter);
}
}
catch (Exception ex) {
FileUtils.deleteDirectory(tempDir.toFile());
throw ex;
}
try {
if (!wroteEntry) {
throw new NoSuchElementException("No relevant entries");
}
@@ -154,21 +161,6 @@ public class SampleDataExporter {
return tempFile;
}
private boolean matchContentTypeHeaderWithMime(String contentType, String mime) {
if (null == contentType) {
return false;
}
/* The content type header may have a charset or other parameters, so we need to
* check if the mime type is a prefix of the content type. */
int semicolonIndex = contentType.indexOf(';');
if (semicolonIndex >= 0) {
return contentType.substring(0, semicolonIndex).equals(mime);
}
return contentType.equals(mime);
}
private void addFileToTar(TarArchiveOutputStream outputStream, Path file, String fileName) throws IOException {
var entry = outputStream.createArchiveEntry(file.toFile(), fileName);
entry.setSize(Files.size(file));

View File

@@ -7,3 +7,4 @@
2025-05-04: Deploy qs, search and api-services.
2025-05-05: Deploy executor partition 4.
2025-05-05: Deploy control.
2025-05-08: Deploy assistant.