1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

5 Commits

6 changed files with 14 additions and 26 deletions

View File

@@ -448,13 +448,7 @@ public class CrawlerMain extends ProcessMainClass {
 // We don't have a lock, so we can't run this task
 // we return to avoid blocking the pool for too long
 if (lock.isEmpty()) {
-    if (retryQueue.remainingCapacity() > 0) {
-        // Sleep a moment to avoid busy looping via the retry queue
-        // in the case when few tasks remain and almost all are ineligible for
-        // immediate restart
-        Thread.sleep(5);
-    }
+    pendingCrawlTasks.remove(domain);
     retryQueue.put(this);
     return;
 }

View File

@@ -74,7 +74,7 @@ public class CrawlerRevisitor {
 // If the reference document is empty or the HTTP status is not 200, we'll skip it since it's
 // unlikely to produce anything meaningful for us.
-if (doc.httpStatus != 200)
+if (doc.httpStatus != 200 && doc.httpStatus != 206)
     continue;
 if (!doc.hasBody())
     continue;

View File

@@ -58,7 +58,7 @@ public record DocumentWithReference(
 if (null == doc)
     return ContentTags.empty();
-if (doc.documentBodyBytes.length == 0 || doc.httpStatus != 200)
+if (doc.documentBodyBytes.length == 0 || (doc.httpStatus != 200 && doc.httpStatus != 206))
     return ContentTags.empty();
 String lastmod = doc.getLastModified();

View File

@@ -1,5 +1,7 @@
 package nu.marginalia;
+import org.apache.commons.lang3.StringUtils;
 import java.util.Set;
 public class ContentTypes {
@@ -11,9 +13,9 @@ public class ContentTypes {
 "text/plain");
 public static boolean isAccepted(String contentTypeHeader) {
-    String lcHeader = contentTypeHeader.toLowerCase();
+    String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
     for (var type : acceptedContentTypes) {
-        if (lcHeader.startsWith(type)) {
+        if (lcHeader.equals(type)) {
             return true;
         }
     }
@@ -21,7 +23,7 @@ public class ContentTypes {
 }
 public static boolean isBinary(String contentTypeHeader) {
-    String lcHeader = contentTypeHeader.toLowerCase();
+    String lcHeader = StringUtils.substringBefore(contentTypeHeader.toLowerCase(), ';');
     return lcHeader.startsWith("application/pdf");
 }

View File

@@ -277,7 +277,8 @@ public record SlopCrawlDataRecord(String domain,
 try (var table = new SlopTable(path)) {
     ShortColumn.Reader statusReader = statusColumn.open(table);
     while (statusReader.hasRemaining()) {
-        if (statusReader.get() == 200) {
+        int status = statusReader.get();
+        if (status == 200 || status == 206) {
             cnt++;
         }
     }

View File

@@ -21,10 +21,7 @@ import java.nio.file.StandardCopyOption;
 import java.nio.file.StandardOpenOption;
 import java.nio.file.attribute.PosixFilePermissions;
 import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.NoSuchElementException;
+import java.util.*;
 public class SampleDataExporter {
     private final FileStorageService storageService;
@@ -127,14 +124,8 @@ public class SampleDataExporter {
 var reader = new SlopCrawlDataRecord.FilteringReader(crawlDataPath) {
     @Override
     public boolean filter(String url, int status, String contentType) {
-        if (contentTypeFilter.equals(contentType))
-            return true;
-        else if (contentType.startsWith("x-marginalia/"))
-            // This is a metadata entry, typically domain or redirect information
-            // let's keep those to not confuse the consumer of the data, which might
-            // expect at least the domain summary
-            return true;
-        return false;
+        return Objects.equals(StringUtils.substringBefore(contentType, ';'), contentTypeFilter)
+                || contentType.startsWith("x-marginalia/"); // metadata records
     }
 }
 ) {
@@ -143,7 +134,7 @@ public class SampleDataExporter {
 var entry = reader.get();
 writer.write(entry);
-wroteEntry = wroteEntry || contentTypeFilter.equals(entry.contentType());
+wroteEntry = wroteEntry || Objects.equals(StringUtils.substringBefore(entry.contentType(), ';'), contentTypeFilter);
 }
 if (!wroteEntry) {