mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00
Compare commits
4 Commits
deploy-017
...
deploy-017
Author | SHA1 | Date | |
---|---|---|---|
|
5e2b63473e | ||
|
f9590703f1 | ||
|
f12fc11337 | ||
|
c309030184 |
@@ -3,11 +3,18 @@
|
||||
<Console name="Console" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ProcessConsole" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%style{P}{FG_Cyan} %msg%n"/>
|
||||
<Filters>
|
||||
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<JSONLayout compact="true" eventEol="true" properties="true" stacktraceAsString="true" includeTimeMillis="true"/>
|
||||
@@ -15,6 +22,7 @@
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
<SizeBasedTriggeringPolicy size="10MB" />
|
||||
</RollingFile>
|
||||
@@ -34,6 +42,7 @@
|
||||
|
||||
<Root level="info">
|
||||
<AppenderRef ref="Console"/>
|
||||
<AppenderRef ref="ProcessConsole"/>
|
||||
<AppenderRef ref="LogToFile"/>
|
||||
</Root>
|
||||
</Loggers>
|
||||
|
@@ -1,13 +1,51 @@
|
||||
<Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
|
||||
<Appenders>
|
||||
<Console name="Console" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
|
||||
<Console name="ConsoleInfo" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="INFO" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleWarn" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="WARN" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleError" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="ERROR" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleFatal" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="FATAL" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="QUERY" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="HTTP" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
<MarkerFilter marker="CRAWLER" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ProcessConsole" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
|
||||
<Filters>
|
||||
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<RollingFile name="LogToFile" fileName="${env:WMSA_LOG_DIR:-/var/log/wmsa}/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}.log" filePattern="/var/log/wmsa/wmsa-${sys:service-name}-${env:WMSA_SERVICE_NODE:-0}-log-%d{MM-dd-yy-HH-mm-ss}-%i.log.gz"
|
||||
ignoreExceptions="false">
|
||||
<PatternLayout>
|
||||
@@ -36,7 +74,11 @@
|
||||
<Logger name="org.apache.zookeeper" level="WARN" />
|
||||
|
||||
<Root level="info">
|
||||
<AppenderRef ref="Console"/>
|
||||
<AppenderRef ref="ConsoleInfo"/>
|
||||
<AppenderRef ref="ConsoleWarn"/>
|
||||
<AppenderRef ref="ConsoleError"/>
|
||||
<AppenderRef ref="ConsoleFatal"/>
|
||||
<AppenderRef ref="ProcessConsole"/>
|
||||
<AppenderRef ref="LogToFile"/>
|
||||
</Root>
|
||||
</Loggers>
|
||||
|
@@ -1,15 +1,49 @@
|
||||
<Configuration xmlns="http://logging.apache.org/log4j/2.0/config" >
|
||||
<Appenders>
|
||||
<Console name="Console" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{HH:mm:ss,SSS} %style{%-8markerSimpleName}{FG_Cyan} %highlight{%-5level}{FATAL=red, ERROR=red, WARN=yellow} %-24t %-20c{1} -- %msg%n"/>
|
||||
<Console name="ConsoleInfo" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="- %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="INFO" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleWarn" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="⚠ %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="WARN" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleError" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="🔥 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="ERROR" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ConsoleFatal" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="💀 %d{HH:mm:ss,SSS} %-20c{1} -- %msg%n"/>
|
||||
<Filters>
|
||||
<LevelMatchFilter level="FATAL" onMatch="ALLOW" onMismatch="DENY"/>
|
||||
<MarkerFilter marker="PROCESS" onMatch="DENY" onMismatch="NEUTRAL" />
|
||||
</Filters>
|
||||
</Console>
|
||||
<Console name="ProcessConsole" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%style{%msg%n}{FG_Cyan}"/>
|
||||
<Filters>
|
||||
<MarkerFilter marker="PROCESS" onMatch="ALLOW" onMismatch="DENY" />
|
||||
</Filters>
|
||||
</Console>
|
||||
</Appenders>
|
||||
<Loggers>
|
||||
<Logger name="org.apache.zookeeper" level="WARN" />
|
||||
|
||||
<Root level="info">
|
||||
<AppenderRef ref="Console"/>
|
||||
<AppenderRef ref="LogToFile"/>
|
||||
<AppenderRef ref="ConsoleInfo"/>
|
||||
<AppenderRef ref="ConsoleWarn"/>
|
||||
<AppenderRef ref="ConsoleError"/>
|
||||
<AppenderRef ref="ConsoleFatal"/>
|
||||
<AppenderRef ref="ProcessConsole"/>
|
||||
</Root>
|
||||
</Loggers>
|
||||
</Configuration>
|
@@ -8,6 +8,8 @@ public class ContentTypes {
|
||||
public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
|
||||
"application/xhtml",
|
||||
"text/html",
|
||||
"text/markdown",
|
||||
"text/x-markdown",
|
||||
"application/pdf",
|
||||
"image/x-icon",
|
||||
"text/plain");
|
||||
|
@@ -158,11 +158,12 @@ public record SlopCrawlDataRecord(String domain,
|
||||
// and is used to store old responses from previous crawls; in this part of the logic
|
||||
// we treat them the same as a normal response
|
||||
|
||||
if (!filterResponse(uaString, response)) {
|
||||
var filterStatus = filterResponse(uaString, response);
|
||||
if (filterStatus.isRejected()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
slopWriter.write(domain, response);
|
||||
slopWriter.write(domain, filterStatus, response);
|
||||
} else if (record instanceof WarcXEntityRefused refused) {
|
||||
slopWriter.write(domain, refused);
|
||||
} else if (record instanceof Warcinfo warcinfo) {
|
||||
@@ -187,25 +188,35 @@ public record SlopCrawlDataRecord(String domain,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
sealed interface ResponseFilterResult {
|
||||
default boolean isRejected() { return false; }
|
||||
record Accept() implements ResponseFilterResult {}
|
||||
record AcceptWithContentType(String contentType) implements ResponseFilterResult {}
|
||||
record AcceptIfPlainText(String contentType) implements ResponseFilterResult {}
|
||||
record Reject() implements ResponseFilterResult {
|
||||
@Override
|
||||
public boolean isRejected() { return true; }
|
||||
}
|
||||
}
|
||||
|
||||
/** Return true if the WarcResponse should be excluded from conversion */
|
||||
private static boolean filterResponse(String uaString, WarcResponse response) throws IOException {
|
||||
private static ResponseFilterResult filterResponse(String uaString, WarcResponse response) throws IOException {
|
||||
|
||||
// We don't want to store robots.txt files, as they are not
|
||||
// interesting for the analysis we want to do. This is important
|
||||
// since txt-files in general are interesting, and we don't want to
|
||||
// exclude them as a class.
|
||||
|
||||
if (response.targetURI().getPath().equals("/robots.txt")) {
|
||||
return false;
|
||||
String uriPath = response.targetURI().getPath();
|
||||
if (uriPath.equals("/robots.txt")) {
|
||||
return new ResponseFilterResult.Reject();
|
||||
}
|
||||
|
||||
var headers = response.http().headers();
|
||||
var robotsTags = headers.all("X-Robots-Tag");
|
||||
|
||||
if (!isXRobotsTagsPermitted(robotsTags, uaString)) {
|
||||
return false;
|
||||
return new ResponseFilterResult.Reject();
|
||||
}
|
||||
|
||||
// Strip out responses with content types we aren't interested in
|
||||
@@ -213,15 +224,29 @@ public record SlopCrawlDataRecord(String domain,
|
||||
String contentType = headers.first("Content-Type").orElse("text/plain").toLowerCase();
|
||||
|
||||
if (!ContentTypes.isAccepted(contentType)) {
|
||||
return false;
|
||||
String contentTypeWithoutParams = StringUtils.substringBefore(contentType, ";");
|
||||
|
||||
// Some servers don't understand what a markdown file is
|
||||
if (contentTypeWithoutParams.equals("application/octet-stream")) {
|
||||
if (uriPath.endsWith(".md")) {
|
||||
// This is a markdown file, which we want to keep
|
||||
return new ResponseFilterResult.AcceptIfPlainText("text/markdown");
|
||||
}
|
||||
else if (uriPath.endsWith(".pdf")) {
|
||||
// This is a text file, which we want to keep
|
||||
return new ResponseFilterResult.AcceptWithContentType("application/pdf");
|
||||
}
|
||||
}
|
||||
|
||||
return new ResponseFilterResult.Reject();
|
||||
}
|
||||
|
||||
// If the format is binary, we don't want to translate it if the response is truncated
|
||||
if (response.truncated() != WarcTruncationReason.NOT_TRUNCATED && ContentTypes.isBinary(contentType)) {
|
||||
return false;
|
||||
return new ResponseFilterResult.Reject();
|
||||
}
|
||||
|
||||
return true;
|
||||
return new ResponseFilterResult.Accept();
|
||||
}
|
||||
|
||||
/** Check X-Robots-Tag header tag to see if we are allowed to index this page.
|
||||
@@ -324,7 +349,7 @@ public record SlopCrawlDataRecord(String domain,
|
||||
headerColumnWriter.put(record.headers);
|
||||
}
|
||||
|
||||
public void write(String domain, WarcResponse response) throws IOException {
|
||||
public void write(String domain, ResponseFilterResult filterStatus, WarcResponse response) throws IOException {
|
||||
|
||||
HttpFetchResult result = HttpFetchResult.importWarc(response);
|
||||
if (!(result instanceof HttpFetchResult.ResultOk fetchOk)) {
|
||||
@@ -347,6 +372,21 @@ public record SlopCrawlDataRecord(String domain,
|
||||
contentType = "";
|
||||
}
|
||||
|
||||
switch (filterStatus) {
|
||||
case ResponseFilterResult.AcceptWithContentType(String ct) -> contentType = ct;
|
||||
case ResponseFilterResult.AcceptIfPlainText(String ct) -> {
|
||||
try {
|
||||
// Parse the body as UTF-8
|
||||
new String(bodyBytes, StandardCharsets.UTF_8);
|
||||
contentType = ct;
|
||||
}
|
||||
catch (RuntimeException ex) { // UTF-8 decoding failed
|
||||
return;
|
||||
}
|
||||
}
|
||||
default -> {}
|
||||
}
|
||||
|
||||
boolean hasCookies = false;
|
||||
|
||||
String headersStr;
|
||||
|
@@ -119,12 +119,17 @@ class CrawlerRetreiverTest {
|
||||
|
||||
|
||||
@Test
|
||||
public void testWarcOutputPDF() throws IOException {
|
||||
public void verifyFileFormatSupport() throws IOException {
|
||||
List<String> urls = List.of(
|
||||
"https://www.marginalia.nu/junk/test.pdf",
|
||||
"https://www.marginalia.nu/junk/test.md"
|
||||
);
|
||||
|
||||
var specs = CrawlerMain.CrawlSpecRecord
|
||||
.builder()
|
||||
.crawlDepth(5)
|
||||
.domain("www.marginalia.nu")
|
||||
.urls(List.of("https://www.marginalia.nu/junk/test.pdf"))
|
||||
.urls(urls)
|
||||
.build();
|
||||
Path tempFile = null;
|
||||
Path slopFile = null;
|
||||
@@ -146,7 +151,11 @@ class CrawlerRetreiverTest {
|
||||
}
|
||||
else if (record instanceof WarcResponse rsp) {
|
||||
responses.add(rsp.target());
|
||||
System.out.println(rsp.type() + ":" + rsp.target());
|
||||
try {
|
||||
System.out.println(rsp.type() + ":" + rsp.target() + ":" + rsp.http().contentType());
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
else {
|
||||
System.out.println(record.type());
|
||||
@@ -154,7 +163,9 @@ class CrawlerRetreiverTest {
|
||||
});
|
||||
}
|
||||
|
||||
assertTrue(requests.contains("https://www.marginalia.nu/junk/test.pdf"));
|
||||
for (var url : urls) {
|
||||
assertTrue(requests.contains(url), "Should have requested " + url);
|
||||
}
|
||||
assertEquals(requests, responses);
|
||||
|
||||
// Convert the WARC file to a Slop file
|
||||
@@ -181,15 +192,18 @@ class CrawlerRetreiverTest {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
// Verify we have a PDF in the Slop file
|
||||
assertNotNull(domain);
|
||||
var pdfDoc = documents.get("https://www.marginalia.nu/junk/test.pdf");
|
||||
assertNotNull(pdfDoc);
|
||||
assertEquals("https://www.marginalia.nu/junk/test.pdf", pdfDoc.url);
|
||||
assertEquals(206, pdfDoc.httpStatus);
|
||||
assertTrue(pdfDoc.documentBodyBytes.length > 100);
|
||||
}
|
||||
finally {
|
||||
for (var url : urls) {
|
||||
// Verify we have the downloaded files in the Slop file
|
||||
assertNotNull(domain);
|
||||
var fetchedDoc = documents.get(url);
|
||||
assertNotNull(fetchedDoc, "Should have a document for " + url);
|
||||
assertEquals(url, fetchedDoc.url);
|
||||
assertTrue(fetchedDoc.httpStatus == 200 || fetchedDoc.httpStatus == 206, "Should be 200 or 206 for " + url);
|
||||
assertTrue(fetchedDoc.documentBodyBytes.length > 32, "Should have a body for " + url);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
} finally {
|
||||
if (tempFile != null)
|
||||
Files.deleteIfExists(tempFile);
|
||||
if (slopFile != null)
|
||||
|
@@ -122,6 +122,8 @@ public class SampleDataExporter {
|
||||
}
|
||||
Files.createDirectory(tempDir);
|
||||
|
||||
boolean wroteEntry = false;
|
||||
|
||||
try (var writer = new SlopCrawlDataRecord.Writer(tempDir);
|
||||
var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
|
||||
@Override
|
||||
@@ -131,14 +133,20 @@ public class SampleDataExporter {
|
||||
}
|
||||
}
|
||||
) {
|
||||
boolean wroteEntry = false;
|
||||
|
||||
while (reader.hasRemaining()) {
|
||||
var entry = reader.get();
|
||||
writer.write(entry);
|
||||
|
||||
wroteEntry = wroteEntry || Objects.equals(StringUtils.substringBefore(entry.contentType(), ';'), contentTypeFilter);
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
FileUtils.deleteDirectory(tempDir.toFile());
|
||||
throw ex;
|
||||
}
|
||||
|
||||
try {
|
||||
if (!wroteEntry) {
|
||||
throw new NoSuchElementException("No relevant entries");
|
||||
}
|
||||
|
Reference in New Issue
Block a user