1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits

...

8 Commits

Author SHA1 Message Date
Viktor Lofgren
c246a59158 (search) Make it clearer that it's a search engine 2025-04-22 16:03:42 +02:00
Viktor
0b99781d24 Merge pull request #191 from MarginaliaSearch/pdf-support-in-crawler
Pdf support in crawler
2025-04-22 15:52:41 +02:00
Viktor Lofgren
39db9620c1 (crawler) Increase maximum permitted file size to 32 MB 2025-04-22 15:51:03 +02:00
Viktor Lofgren
1781599363 (crawler) Add support for crawling PDF files 2025-04-22 15:50:05 +02:00
Viktor Lofgren
6b2d18fb9b (crawler) Adjust domain limits to be generally more permissive. 2025-04-22 15:27:57 +02:00
Viktor
59b1d200ab Merge pull request #190 from MarginaliaSearch/download-sample-chores
Download sample chores
2025-04-22 13:29:49 +02:00
Viktor Lofgren
897010a2cf (control) Update download sample data actor with better UI
The original implementation didn't really give a lot of feedback about what it was doing.  Adding a progress bar to the download step.

Relates to issue 189.
2025-04-22 13:27:22 +02:00
Viktor Lofgren
602af7a77e (control) Update UI with new sample sizes
Relates to issue 189.
2025-04-22 13:27:13 +02:00
12 changed files with 101 additions and 28 deletions

View File

@@ -8,6 +8,7 @@ import nu.marginalia.actor.state.ActorResumeBehavior;
import nu.marginalia.actor.state.ActorStep;
import nu.marginalia.actor.state.Resume;
import nu.marginalia.service.control.ServiceEventLog;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;
import nu.marginalia.storage.model.FileStorageId;
@@ -19,6 +20,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
@@ -32,6 +34,7 @@ public class DownloadSampleActor extends RecordActorPrototype {
private final FileStorageService storageService;
private final ServiceEventLog eventLog;
private final ServiceHeartbeat heartbeat;
private final Logger logger = LoggerFactory.getLogger(getClass());
@Resume(behavior = ActorResumeBehavior.ERROR)
@@ -66,15 +69,39 @@ public class DownloadSampleActor extends RecordActorPrototype {
Files.deleteIfExists(Path.of(tarFileName));
try (var is = new BufferedInputStream(new URI(downloadURI).toURL().openStream());
var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
is.transferTo(os);
HttpURLConnection urlConnection = (HttpURLConnection) new URI(downloadURI).toURL().openConnection();
try (var hb = heartbeat.createServiceAdHocTaskHeartbeat("Downloading sample")) {
long size = urlConnection.getContentLengthLong();
byte[] buffer = new byte[8192];
try (var is = new BufferedInputStream(urlConnection.getInputStream());
var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
long copiedSize = 0;
while (copiedSize < size) {
int read = is.read(buffer);
if (read < 0) // We've been promised a file of length 'size'
throw new IOException("Unexpected end of stream");
os.write(buffer, 0, read);
copiedSize += read;
// Update progress bar
hb.progress(String.format("%d MB", copiedSize / 1024 / 1024), (int) (copiedSize / 1024), (int) (size / 1024));
}
}
}
catch (Exception ex) {
eventLog.logEvent(DownloadSampleActor.class, "Error downloading sample");
logger.error("Error downloading sample", ex);
yield new Error();
}
finally {
urlConnection.disconnect();
}
eventLog.logEvent(DownloadSampleActor.class, "Download complete");
yield new Extract(fileStorageId, tarFileName);
@@ -170,11 +197,12 @@ public class DownloadSampleActor extends RecordActorPrototype {
@Inject
public DownloadSampleActor(Gson gson,
FileStorageService storageService,
ServiceEventLog eventLog)
ServiceEventLog eventLog, ServiceHeartbeat heartbeat)
{
super(gson);
this.storageService = storageService;
this.eventLog = eventLog;
this.heartbeat = heartbeat;
}
}

View File

@@ -495,7 +495,7 @@ public class CrawlerMain extends ProcessMainClass {
// (mostly a case when migrating from legacy->warc)
reference.delete();
// Convert the WARC file to Parquet
// Convert the WARC file to Slop
SlopCrawlDataRecord
.convertWarc(domain, userAgent, newWarcFile, slopFile);

View File

@@ -41,7 +41,7 @@ public class WarcRecorder implements AutoCloseable {
static final int MAX_TIME = 30_000;
/** Maximum (decompressed) size we'll save */
static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);
static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 32 * 1024 * 1024);
private final WarcWriter writer;
private final Path warcFile;

View File

@@ -43,18 +43,18 @@ public class DomainLocks {
return new Semaphore(16);
if (topDomain.equals("blogspot.com"))
return new Semaphore(8);
if (topDomain.equals("tumblr.com"))
return new Semaphore(8);
if (topDomain.equals("neocities.org"))
return new Semaphore(4);
return new Semaphore(8);
if (topDomain.equals("github.io"))
return new Semaphore(4);
return new Semaphore(8);
// Substack really dislikes broad-scale crawlers, so we need to be careful
// to not get blocked.
if (topDomain.equals("substack.com")) {
return new Semaphore(1);
}
if (topDomain.endsWith(".edu")) {
return new Semaphore(1);
}
return new Semaphore(2);
}

View File

@@ -6,6 +6,7 @@ public class ContentTypes {
public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
"application/xhtml",
"text/html",
"application/pdf",
"image/x-icon",
"text/plain");
@@ -19,4 +20,9 @@ public class ContentTypes {
return false;
}
public static boolean isBinary(String contentTypeHeader) {
String lcHeader = contentTypeHeader.toLowerCase();
return lcHeader.startsWith("application/pdf");
}
}

View File

@@ -37,8 +37,12 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
public boolean filter(String url, int status, String contentType) {
String ctLc = contentType.toLowerCase();
// Permit all plain text content types
if (ctLc.startsWith("text/"))
return true;
// PDF
else if (ctLc.startsWith("application/pdf"))
return true;
else if (ctLc.startsWith("x-marginalia/"))
return true;

View File

@@ -10,7 +10,7 @@ import java.util.regex.Pattern;
public class ContentTypeLogic {
private static final Predicate<String> probableHtmlPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md)$").asMatchPredicate();
private static final Predicate<String> probableGoodPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md|pdf)$").asMatchPredicate();
private static final Predicate<String> probableBinaryPattern = Pattern.compile("^.*\\.[a-z]+$").asMatchPredicate();
private static final Set<String> blockedContentTypes = Set.of("text/css", "text/javascript");
private static final List<String> acceptedContentTypePrefixes = List.of(
@@ -22,6 +22,7 @@ public class ContentTypeLogic {
"application/rss+xml",
"application/x-rss+xml",
"application/rdf+xml",
"application/pdf",
"x-rss+xml"
);
private boolean allowAllContentTypes = false;
@@ -34,7 +35,7 @@ public class ContentTypeLogic {
public boolean isUrlLikeBinary(EdgeUrl url) {
String pathLowerCase = url.path.toLowerCase();
if (probableHtmlPattern.test(pathLowerCase))
if (probableGoodPattern.test(pathLowerCase))
return false;
return probableBinaryPattern.test(pathLowerCase);

View File

@@ -216,6 +216,11 @@ public record SlopCrawlDataRecord(String domain,
return false;
}
// If the format is binary, we don't want to translate it if the response is truncated
if (response.truncated() != WarcTruncationReason.NOT_TRUNCATED && ContentTypes.isBinary(contentType)) {
return false;
}
return true;
}

View File

@@ -40,6 +40,8 @@ class HttpFetcherImplFetchTest {
private static EdgeUrl badHttpStatusUrl;
private static EdgeUrl keepAliveUrl;
private static EdgeUrl pdfUrl;
@BeforeAll
public static void setupAll() throws URISyntaxException {
wireMockServer =
@@ -133,6 +135,13 @@ class HttpFetcherImplFetchTest {
));
pdfUrl = new EdgeUrl("http://localhost:18089/test.pdf");
wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(pdfUrl.path))
.willReturn(WireMock.aResponse()
.withHeader("Content-Type", "application/pdf")
.withStatus(200)
.withBody("Hello World")));
wireMockServer.start();
}
@@ -352,6 +361,14 @@ class HttpFetcherImplFetchTest {
Assertions.assertTrue(result.isOk());
}
@Test
public void testPdf() {
var result = fetcher.fetchContent(pdfUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
Assertions.assertTrue(result.isOk());
}
private List<WarcRecord> getWarcRecords() throws IOException {
List<WarcRecord> records = new ArrayList<>();

View File

@@ -4,9 +4,9 @@ import nu.marginalia.UserAgent;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.DomainCookies;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
import nu.marginalia.slop.SlopCrawlDataRecord;
import org.apache.hc.client5.http.classic.HttpClient;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.client5.http.impl.classic.HttpClients;
@@ -24,13 +24,14 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.security.NoSuchAlgorithmException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static org.junit.jupiter.api.Assertions.assertEquals;
class WarcRecorderTest {
Path fileNameWarc;
Path fileNameParquet;
Path fileNameSlop;
WarcRecorder client;
HttpClient httpClient;
@@ -39,7 +40,7 @@ class WarcRecorderTest {
httpClient = HttpClients.createDefault();
fileNameWarc = Files.createTempFile("test", ".warc");
fileNameParquet = Files.createTempFile("test", ".parquet");
fileNameSlop = Files.createTempFile("test", ".slop.zip");
client = new WarcRecorder(fileNameWarc);
}
@@ -159,17 +160,28 @@ class WarcRecorderTest {
client.fetch(httpClient, new DomainCookies(), request3);
CrawledDocumentParquetRecordFileWriter.convertWarc(
HttpGet request4 = new HttpGet("https://downloads.marginalia.nu/test.pdf");
request4.addHeader("User-agent", "test.marginalia.nu");
request4.addHeader("Accept-Encoding", "gzip");
client.fetch(httpClient, new DomainCookies(), request4);
SlopCrawlDataRecord.convertWarc(
"www.marginalia.nu",
new UserAgent("test", "test"),
fileNameWarc,
fileNameParquet);
fileNameSlop);
var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList();
assertEquals(2, urls.size());
List<String> urls;
try (var stream = SerializableCrawlDataStream.openDataStream(fileNameSlop)) {
urls = stream.docsAsList().stream().map(doc -> doc.url.toString()).toList();
}
assertEquals(3, urls.size());
assertEquals("https://www.marginalia.nu/", urls.get(0));
assertEquals("https://www.marginalia.nu/log/", urls.get(1));
// sanic.jpg gets filtered out for its bad mime type
assertEquals("https://downloads.marginalia.nu/test.pdf", urls.get(2));
}

View File

@@ -13,7 +13,7 @@
class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"
value="${query}"
autofocus
placeholder="Search..."
placeholder="Search the web!"
autocomplete="off"
name="query"
id="searchInput" />
@@ -21,7 +21,7 @@
<input type="text"
class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"
value="${query}"
placeholder="Search..."
placeholder="Search the web!"
autocomplete="off"
name="query"
id="searchInput" />

View File

@@ -24,25 +24,25 @@ This is a sample of real crawl data. It is intended for demo, testing and devel
<tr>
<td><input id="sample-s" value="sample-s" name="sample" class="form-check-input" type="radio"></td>
<td><label for="sample-s">Small</label></td>
<td>1000 Domains. About 2 GB. </td>
<td>1000 Domains. About 1 GB. </td>
</tr>
<tr>
<td><input id="sample-m" value="sample-m" name="sample" class="form-check-input" type="radio"></td>
<td><label for="sample-m">Medium</label></td>
<td>2000 Domains. About 6 GB. Recommended.</td>
<td>2000 Domains. About 2 GB. Recommended.</td>
</tr>
<tr>
<td><input id="sample-l" value="sample-l" name="sample" class="form-check-input" type="radio"></td>
<td><label for="sample-l">Large</label></td>
<td>5000 Domains. About 20 GB.</td>
<td>5000 Domains. About 7 GB.</td>
</tr>
<tr>
<td><input id="sample-xl" value="sample-xl" name="sample" class="form-check-input" type="radio"></td>
<td><label for="sample-xl">Huge</label></td>
<td>50,000 Domains. Around 180 GB. Primarily intended for pre-production like testing environments.
<td>50,000 Domains. Around 80 GB. Primarily intended for pre-production like testing environments.
Expect hours of processing time. </td>
</tr>
</table>