1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

..

4 Commits

Author SHA1 Message Date
Viktor Lofgren
c246a59158 (search) Make it clearer that it's a search engine 2025-04-22 16:03:42 +02:00
Viktor
0b99781d24 Merge pull request #191 from MarginaliaSearch/pdf-support-in-crawler
Pdf support in crawler
2025-04-22 15:52:41 +02:00
Viktor Lofgren
39db9620c1 (crawler) Increase maximum permitted file size to 32 MB 2025-04-22 15:51:03 +02:00
Viktor Lofgren
1781599363 (crawler) Add support for crawling PDF files 2025-04-22 15:50:05 +02:00
9 changed files with 59 additions and 14 deletions

View File

@@ -495,7 +495,7 @@ public class CrawlerMain extends ProcessMainClass {
// (mostly a case when migrating from legacy->warc)
reference.delete();
// Convert the WARC file to Parquet
// Convert the WARC file to Slop
SlopCrawlDataRecord
.convertWarc(domain, userAgent, newWarcFile, slopFile);

View File

@@ -41,7 +41,7 @@ public class WarcRecorder implements AutoCloseable {
static final int MAX_TIME = 30_000;
/** Maximum (decompressed) size we'll save */
static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);
static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 32 * 1024 * 1024);
private final WarcWriter writer;
private final Path warcFile;

View File

@@ -6,6 +6,7 @@ public class ContentTypes {
public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
"application/xhtml",
"text/html",
"application/pdf",
"image/x-icon",
"text/plain");
@@ -19,4 +20,9 @@ public class ContentTypes {
return false;
}
/**
 * Reports whether a Content-Type header value denotes a binary document format.
 * Currently only PDF is recognised: the header is lower-cased and checked for
 * the prefix "application/pdf", so trailing parameters after the media type
 * do not prevent a match.
 *
 * @param contentTypeHeader the Content-Type header value; may be null
 * @return true if the header starts with "application/pdf" (ignoring case),
 *         false otherwise, including when the header is null
 */
public static boolean isBinary(String contentTypeHeader) {
    if (contentTypeHeader == null) {
        return false;
    }

    // Lower-case with Locale.ROOT: the no-arg toLowerCase() uses the default
    // locale, and e.g. under a Turkish locale 'I' becomes a dotless 'ı', which
    // would make "APPLICATION/PDF" fail the prefix check below.
    String lcHeader = contentTypeHeader.toLowerCase(java.util.Locale.ROOT);
    return lcHeader.startsWith("application/pdf");
}
}

View File

@@ -37,8 +37,12 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
public boolean filter(String url, int status, String contentType) {
String ctLc = contentType.toLowerCase();
// Permit all plain text content types
if (ctLc.startsWith("text/"))
return true;
// PDF
else if (ctLc.startsWith("application/pdf"))
return true;
else if (ctLc.startsWith("x-marginalia/"))
return true;

View File

@@ -10,7 +10,7 @@ import java.util.regex.Pattern;
public class ContentTypeLogic {
private static final Predicate<String> probableHtmlPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md)$").asMatchPredicate();
private static final Predicate<String> probableGoodPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md|pdf)$").asMatchPredicate();
private static final Predicate<String> probableBinaryPattern = Pattern.compile("^.*\\.[a-z]+$").asMatchPredicate();
private static final Set<String> blockedContentTypes = Set.of("text/css", "text/javascript");
private static final List<String> acceptedContentTypePrefixes = List.of(
@@ -22,6 +22,7 @@ public class ContentTypeLogic {
"application/rss+xml",
"application/x-rss+xml",
"application/rdf+xml",
"application/pdf",
"x-rss+xml"
);
private boolean allowAllContentTypes = false;
@@ -34,7 +35,7 @@ public class ContentTypeLogic {
public boolean isUrlLikeBinary(EdgeUrl url) {
String pathLowerCase = url.path.toLowerCase();
if (probableHtmlPattern.test(pathLowerCase))
if (probableGoodPattern.test(pathLowerCase))
return false;
return probableBinaryPattern.test(pathLowerCase);

View File

@@ -216,6 +216,11 @@ public record SlopCrawlDataRecord(String domain,
return false;
}
// If the format is binary, we don't want to translate it if the response is truncated
if (response.truncated() != WarcTruncationReason.NOT_TRUNCATED && ContentTypes.isBinary(contentType)) {
return false;
}
return true;
}

View File

@@ -40,6 +40,8 @@ class HttpFetcherImplFetchTest {
private static EdgeUrl badHttpStatusUrl;
private static EdgeUrl keepAliveUrl;
private static EdgeUrl pdfUrl;
@BeforeAll
public static void setupAll() throws URISyntaxException {
wireMockServer =
@@ -133,6 +135,13 @@ class HttpFetcherImplFetchTest {
));
pdfUrl = new EdgeUrl("http://localhost:18089/test.pdf");
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(pdfUrl.path))
.willReturn(WireMock.aResponse()
.withHeader("Content-Type", "application/pdf")
.withStatus(200)
.withBody("Hello World")));
wireMockServer.start();
}
@@ -352,6 +361,14 @@ class HttpFetcherImplFetchTest {
Assertions.assertTrue(result.isOk());
}
@Test
public void testPdf() {
    // Fetch the stubbed PDF URL with a full probe and expect a successful result
    var fetchResult = fetcher.fetchContent(
            pdfUrl,
            warcRecorder,
            new DomainCookies(),
            new CrawlDelayTimer(1000),
            ContentTags.empty(),
            HttpFetcher.ProbeType.FULL);

    Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, fetchResult);
    Assertions.assertTrue(fetchResult.isOk());
}
private List<WarcRecord> getWarcRecords() throws IOException {
List<WarcRecord> records = new ArrayList<>();

View File

@@ -4,9 +4,9 @@ import nu.marginalia.UserAgent;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.DomainCookies;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
import nu.marginalia.slop.SlopCrawlDataRecord;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.client5.http.impl.classic.HttpClients;
@@ -24,13 +24,14 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.security.NoSuchAlgorithmException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static org.junit.jupiter.api.Assertions.assertEquals;
class WarcRecorderTest {
Path fileNameWarc;
Path fileNameParquet;
Path fileNameSlop;
WarcRecorder client;
HttpClient httpClient;
@@ -39,7 +40,7 @@ class WarcRecorderTest {
httpClient = HttpClients.createDefault();
fileNameWarc = Files.createTempFile("test", ".warc");
fileNameParquet = Files.createTempFile("test", ".parquet");
fileNameSlop = Files.createTempFile("test", ".slop.zip");
client = new WarcRecorder(fileNameWarc);
}
@@ -159,17 +160,28 @@ class WarcRecorderTest {
client.fetch(httpClient, new DomainCookies(), request3);
CrawledDocumentParquetRecordFileWriter.convertWarc(
HttpGet request4 = new HttpGet("https://downloads.marginalia.nu/test.pdf");
request4.addHeader("User-agent", "test.marginalia.nu");
request4.addHeader("Accept-Encoding", "gzip");
client.fetch(httpClient, new DomainCookies(), request4);
SlopCrawlDataRecord.convertWarc(
"www.marginalia.nu",
new UserAgent("test", "test"),
fileNameWarc,
fileNameParquet);
fileNameSlop);
var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList();
assertEquals(2, urls.size());
List<String> urls;
try (var stream = SerializableCrawlDataStream.openDataStream(fileNameSlop)) {
urls = stream.docsAsList().stream().map(doc -> doc.url.toString()).toList();
}
assertEquals(3, urls.size());
assertEquals("https://www.marginalia.nu/", urls.get(0));
assertEquals("https://www.marginalia.nu/log/", urls.get(1));
// sanic.jpg gets filtered out for its bad mime type
assertEquals("https://downloads.marginalia.nu/test.pdf", urls.get(2));
}

View File

@@ -13,7 +13,7 @@
class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"
value="${query}"
autofocus
placeholder="Search..."
placeholder="Search the web!"
autocomplete="off"
name="query"
id="searchInput" />
@@ -21,7 +21,7 @@
<input type="text"
class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"
value="${query}"
placeholder="Search..."
placeholder="Search the web!"
autocomplete="off"
name="query"
id="searchInput" />