mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-05 21:22:39 +02:00

Compare commits: deploy-014 ... deploy-014 (8 commits)

Commits (SHA1):
c246a59158
0b99781d24
39db9620c1
1781599363
6b2d18fb9b
59b1d200ab
897010a2cf
602af7a77e
@@ -8,6 +8,7 @@ import nu.marginalia.actor.state.ActorResumeBehavior;
 import nu.marginalia.actor.state.ActorStep;
 import nu.marginalia.actor.state.Resume;
 import nu.marginalia.service.control.ServiceEventLog;
+import nu.marginalia.service.control.ServiceHeartbeat;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorage;
 import nu.marginalia.storage.model.FileStorageId;
@@ -19,6 +20,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.*;
+import java.net.HttpURLConnection;
 import java.net.MalformedURLException;
 import java.net.URI;
 import java.net.URL;
@@ -32,6 +34,7 @@ public class DownloadSampleActor extends RecordActorPrototype {
 
     private final FileStorageService storageService;
     private final ServiceEventLog eventLog;
+    private final ServiceHeartbeat heartbeat;
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
     @Resume(behavior = ActorResumeBehavior.ERROR)
@@ -66,15 +69,39 @@ public class DownloadSampleActor extends RecordActorPrototype {
 
         Files.deleteIfExists(Path.of(tarFileName));
 
-        try (var is = new BufferedInputStream(new URI(downloadURI).toURL().openStream());
-             var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
-            is.transferTo(os);
+        HttpURLConnection urlConnection = (HttpURLConnection) new URI(downloadURI).toURL().openConnection();
+
+        try (var hb = heartbeat.createServiceAdHocTaskHeartbeat("Downloading sample")) {
+            long size = urlConnection.getContentLengthLong();
+            byte[] buffer = new byte[8192];
+
+            try (var is = new BufferedInputStream(urlConnection.getInputStream());
+                 var os = new BufferedOutputStream(Files.newOutputStream(Path.of(tarFileName), StandardOpenOption.CREATE))) {
+                long copiedSize = 0;
+
+                while (copiedSize < size) {
+                    int read = is.read(buffer);
+
+                    if (read < 0) // We've been promised a file of length 'size'
+                        throw new IOException("Unexpected end of stream");
+
+                    os.write(buffer, 0, read);
+                    copiedSize += read;
+
+                    // Update progress bar
+                    hb.progress(String.format("%d MB", copiedSize / 1024 / 1024), (int) (copiedSize / 1024), (int) (size / 1024));
+                }
+            }
+
         }
         catch (Exception ex) {
             eventLog.logEvent(DownloadSampleActor.class, "Error downloading sample");
             logger.error("Error downloading sample", ex);
             yield new Error();
         }
+        finally {
+            urlConnection.disconnect();
+        }
 
         eventLog.logEvent(DownloadSampleActor.class, "Download complete");
         yield new Extract(fileStorageId, tarFileName);
@@ -170,11 +197,12 @@ public class DownloadSampleActor extends RecordActorPrototype {
     @Inject
     public DownloadSampleActor(Gson gson,
                                FileStorageService storageService,
-                               ServiceEventLog eventLog)
+                               ServiceEventLog eventLog, ServiceHeartbeat heartbeat)
    {
        super(gson);
        this.storageService = storageService;
        this.eventLog = eventLog;
        this.heartbeat = heartbeat;
    }

}
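Note: the change above swaps a one-shot InputStream.transferTo copy for a manual chunked copy loop, so that download progress can be reported through the service heartbeat while the sample crawl data is fetched. The general pattern can be sketched in isolation; the ProgressListener interface and downloadWithProgress method below are illustrative names only (not part of the Marginalia codebase), and the sketch assumes the server reports an accurate Content-Length.

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

class DownloadSketch {
    /** Hypothetical callback, standing in for the ad-hoc task heartbeat. */
    interface ProgressListener {
        void progress(long copiedBytes, long totalBytes);
    }

    /** Copy a URL to a file in fixed-size chunks, reporting progress after each chunk. */
    static void downloadWithProgress(String url, Path dest, ProgressListener listener) throws Exception {
        HttpURLConnection conn = (HttpURLConnection) new URI(url).toURL().openConnection();
        try (var is = new BufferedInputStream(conn.getInputStream());
             var os = new BufferedOutputStream(Files.newOutputStream(dest, StandardOpenOption.CREATE))) {
            long size = conn.getContentLengthLong();
            byte[] buffer = new byte[8192];
            long copied = 0;

            while (copied < size) {
                int read = is.read(buffer);
                if (read < 0) // promised 'size' bytes, so EOF here is an error
                    throw new IOException("Unexpected end of stream");

                os.write(buffer, 0, read);
                copied += read;
                listener.progress(copied, size);
            }
        }
        finally {
            conn.disconnect();
        }
    }
}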
@@ -495,7 +495,7 @@ public class CrawlerMain extends ProcessMainClass {
     // (mostly a case when migrating from legacy->warc)
     reference.delete();
 
-    // Convert the WARC file to Parquet
+    // Convert the WARC file to Slop
     SlopCrawlDataRecord
             .convertWarc(domain, userAgent, newWarcFile, slopFile);
 
@@ -41,7 +41,7 @@ public class WarcRecorder implements AutoCloseable {
     static final int MAX_TIME = 30_000;
 
     /** Maximum (decompressed) size we'll save */
-    static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);
+    static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 32 * 1024 * 1024);
 
     private final WarcWriter writer;
     private final Path warcFile;
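Since MAX_SIZE is read via Integer.getInteger, the 32 MB figure above is only a default: the limit can be overridden at runtime with the crawler.maxFetchSize JVM system property. A minimal illustration of that lookup follows; the overriding value in the comment is an example, not a project default.

// Integer.getInteger returns the system property parsed as an int,
// or the supplied default (here 32 MB) when the property is unset.
public class MaxFetchSizeSketch {
    static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 32 * 1024 * 1024);

    public static void main(String[] args) {
        // e.g. java -Dcrawler.maxFetchSize=67108864 MaxFetchSizeSketch  -> prints 67108864
        System.out.println(MAX_SIZE);
    }
}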
@@ -43,18 +43,18 @@ public class DomainLocks {
         return new Semaphore(16);
     if (topDomain.equals("blogspot.com"))
         return new Semaphore(8);
 
     if (topDomain.equals("tumblr.com"))
         return new Semaphore(8);
     if (topDomain.equals("neocities.org"))
-        return new Semaphore(4);
+        return new Semaphore(8);
     if (topDomain.equals("github.io"))
-        return new Semaphore(4);
+        return new Semaphore(8);
 
     // Substack really dislikes broad-scale crawlers, so we need to be careful
     // to not get blocked.
     if (topDomain.equals("substack.com")) {
         return new Semaphore(1);
     }
     if (topDomain.endsWith(".edu")) {
         return new Semaphore(1);
     }
 
     return new Semaphore(2);
 }
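The Semaphore counts above are per-top-domain permit counts: they cap how many crawler tasks may fetch from the same top domain concurrently, with shared-hosting providers like neocities.org and github.io now allowed more parallelism and crawl-hostile hosts like substack.com kept at a single connection. A minimal sketch of this throttling pattern is shown below; the class and method names are illustrative and do not reflect the actual DomainLocks API.

import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore;

// Sketch of per-top-domain throttling: acquire a permit before fetching, release after.
class DomainThrottleSketch {
    private final Map<String, Semaphore> locks = new ConcurrentHashMap<>();

    private static Semaphore defaultPermits(String topDomain) {
        // Shared hosts tolerate more parallelism; unknown domains get a conservative default.
        if (topDomain.equals("blogspot.com")) return new Semaphore(8);
        if (topDomain.equals("substack.com")) return new Semaphore(1);
        return new Semaphore(2);
    }

    /** Run a fetch while holding a permit for the URL's top domain. */
    <T> T withLock(String topDomain, Callable<T> fetch) throws Exception {
        Semaphore sem = locks.computeIfAbsent(topDomain, DomainThrottleSketch::defaultPermits);
        sem.acquire();
        try {
            return fetch.call();
        } finally {
            sem.release();
        }
    }
}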
@@ -6,6 +6,7 @@ public class ContentTypes {
     public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
             "application/xhtml",
             "text/html",
+            "application/pdf",
             "image/x-icon",
             "text/plain");
 
@@ -19,4 +20,9 @@
         return false;
     }
 
+    public static boolean isBinary(String contentTypeHeader) {
+        String lcHeader = contentTypeHeader.toLowerCase();
+        return lcHeader.startsWith("application/pdf");
+    }
+
 }
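The new isBinary helper classifies a Content-Type header as a binary format (currently only application/pdf). Its purpose shows up in the SlopCrawlDataRecord hunk further down: truncated responses of binary types are dropped, presumably because a cut-off PDF is generally unusable, whereas truncated HTML or plain text can still be parsed. A self-contained sketch of that decision, with the inputs standing in for values read from the WARC record:

// Illustrative only: mirrors the truncation check in SlopCrawlDataRecord below.
class TruncationFilterSketch {
    static boolean isBinary(String contentTypeHeader) {
        return contentTypeHeader.toLowerCase().startsWith("application/pdf");
    }

    static boolean shouldKeep(boolean truncated, String contentType) {
        if (truncated && isBinary(contentType))
            return false; // a partially downloaded PDF cannot be parsed
        return true;      // truncated text formats can still be salvaged
    }
}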
@@ -37,8 +37,12 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa
     public boolean filter(String url, int status, String contentType) {
         String ctLc = contentType.toLowerCase();
 
+        // Permit all plain text content types
         if (ctLc.startsWith("text/"))
             return true;
+        // PDF
+        else if (ctLc.startsWith("application/pdf"))
+            return true;
         else if (ctLc.startsWith("x-marginalia/"))
             return true;
 
@@ -10,7 +10,7 @@ import java.util.regex.Pattern;
 
 public class ContentTypeLogic {
 
-    private static final Predicate<String> probableHtmlPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md)$").asMatchPredicate();
+    private static final Predicate<String> probableGoodPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md|pdf)$").asMatchPredicate();
     private static final Predicate<String> probableBinaryPattern = Pattern.compile("^.*\\.[a-z]+$").asMatchPredicate();
     private static final Set<String> blockedContentTypes = Set.of("text/css", "text/javascript");
     private static final List<String> acceptedContentTypePrefixes = List.of(
@@ -22,6 +22,7 @@ public class ContentTypeLogic {
             "application/rss+xml",
             "application/x-rss+xml",
             "application/rdf+xml",
+            "application/pdf",
             "x-rss+xml"
     );
     private boolean allowAllContentTypes = false;
@@ -34,7 +35,7 @@ public class ContentTypeLogic {
     public boolean isUrlLikeBinary(EdgeUrl url) {
         String pathLowerCase = url.path.toLowerCase();
 
-        if (probableHtmlPattern.test(pathLowerCase))
+        if (probableGoodPattern.test(pathLowerCase))
             return false;
 
         return probableBinaryPattern.test(pathLowerCase);
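With the renamed probableGoodPattern, isUrlLikeBinary now treats .pdf URLs as worth fetching: a path ending in one of the "probably good" extensions is never classified as binary, while any other dotted extension falls through to the binary heuristic. A standalone illustration of how the two predicates above classify a few paths follows; the example URLs are invented for demonstration.

import java.util.function.Predicate;
import java.util.regex.Pattern;

// Same regexes as above, applied to a plain path string instead of an EdgeUrl.
class ContentTypeLogicSketch {
    static final Predicate<String> probableGoodPattern =
            Pattern.compile("^.*\\.(htm|html|php|txt|md|pdf)$").asMatchPredicate();
    static final Predicate<String> probableBinaryPattern =
            Pattern.compile("^.*\\.[a-z]+$").asMatchPredicate();

    static boolean isUrlLikeBinary(String pathLowerCase) {
        if (probableGoodPattern.test(pathLowerCase))
            return false;
        return probableBinaryPattern.test(pathLowerCase);
    }

    public static void main(String[] args) {
        System.out.println(isUrlLikeBinary("/docs/paper.pdf")); // false -- PDFs are now fetched
        System.out.println(isUrlLikeBinary("/index.html"));     // false
        System.out.println(isUrlLikeBinary("/photos/cat.jpg")); // true  -- skipped as binary
        System.out.println(isUrlLikeBinary("/about"));          // false -- no extension, assumed a page
    }
}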
@@ -216,6 +216,11 @@ public record SlopCrawlDataRecord(String domain,
             return false;
         }
 
+        // If the format is binary, we don't want to translate it if the response is truncated
+        if (response.truncated() != WarcTruncationReason.NOT_TRUNCATED && ContentTypes.isBinary(contentType)) {
+            return false;
+        }
+
         return true;
     }
 
@@ -40,6 +40,8 @@ class HttpFetcherImplFetchTest {
     private static EdgeUrl badHttpStatusUrl;
     private static EdgeUrl keepAliveUrl;
 
+    private static EdgeUrl pdfUrl;
+
     @BeforeAll
     public static void setupAll() throws URISyntaxException {
         wireMockServer =
@@ -133,6 +135,13 @@ class HttpFetcherImplFetchTest {
                 ));
 
 
+        pdfUrl = new EdgeUrl("http://localhost:18089/test.pdf");
+        wireMockServer.stubFor(WireMock.get(WireMock.urlEqualTo(pdfUrl.path))
+                .willReturn(WireMock.aResponse()
+                        .withHeader("Content-Type", "application/pdf")
+                        .withStatus(200)
+                        .withBody("Hello World")));
+
         wireMockServer.start();
 
     }
@@ -352,6 +361,14 @@ class HttpFetcherImplFetchTest {
         Assertions.assertTrue(result.isOk());
     }
 
+    @Test
+    public void testPdf() {
+        var result = fetcher.fetchContent(pdfUrl, warcRecorder, new DomainCookies(), new CrawlDelayTimer(1000), ContentTags.empty(), HttpFetcher.ProbeType.FULL);
+
+        Assertions.assertInstanceOf(HttpFetchResult.ResultOk.class, result);
+        Assertions.assertTrue(result.isOk());
+    }
+
     private List<WarcRecord> getWarcRecords() throws IOException {
         List<WarcRecord> records = new ArrayList<>();
 
@@ -4,9 +4,9 @@ import nu.marginalia.UserAgent;
 import nu.marginalia.crawl.fetcher.ContentTags;
 import nu.marginalia.crawl.fetcher.DomainCookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
+import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
-import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
+import nu.marginalia.slop.SlopCrawlDataRecord;
 import org.apache.hc.client5.http.classic.HttpClient;
 import org.apache.hc.client5.http.classic.methods.HttpGet;
 import org.apache.hc.client5.http.impl.classic.HttpClients;
@@ -24,13 +24,14 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.security.NoSuchAlgorithmException;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
 class WarcRecorderTest {
     Path fileNameWarc;
-    Path fileNameParquet;
+    Path fileNameSlop;
     WarcRecorder client;
 
     HttpClient httpClient;
@@ -39,7 +40,7 @@ class WarcRecorderTest {
         httpClient = HttpClients.createDefault();
 
         fileNameWarc = Files.createTempFile("test", ".warc");
-        fileNameParquet = Files.createTempFile("test", ".parquet");
+        fileNameSlop = Files.createTempFile("test", ".slop.zip");
 
         client = new WarcRecorder(fileNameWarc);
     }
@@ -159,17 +160,28 @@ class WarcRecorderTest {
 
         client.fetch(httpClient, new DomainCookies(), request3);
 
-        CrawledDocumentParquetRecordFileWriter.convertWarc(
+        HttpGet request4 = new HttpGet("https://downloads.marginalia.nu/test.pdf");
+        request4.addHeader("User-agent", "test.marginalia.nu");
+        request4.addHeader("Accept-Encoding", "gzip");
+
+        client.fetch(httpClient, new DomainCookies(), request4);
+
+        SlopCrawlDataRecord.convertWarc(
                 "www.marginalia.nu",
                 new UserAgent("test", "test"),
                 fileNameWarc,
-                fileNameParquet);
+                fileNameSlop);
 
-        var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList();
-        assertEquals(2, urls.size());
+        List<String> urls;
+        try (var stream = SerializableCrawlDataStream.openDataStream(fileNameSlop)) {
+            urls = stream.docsAsList().stream().map(doc -> doc.url.toString()).toList();
+        }
+
+        assertEquals(3, urls.size());
         assertEquals("https://www.marginalia.nu/", urls.get(0));
         assertEquals("https://www.marginalia.nu/log/", urls.get(1));
         // sanic.jpg gets filtered out for its bad mime type
+        assertEquals("https://downloads.marginalia.nu/test.pdf", urls.get(2));
 
     }
@@ -13,7 +13,7 @@
            class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"
            value="${query}"
            autofocus
-           placeholder="Search..."
+           placeholder="Search the web!"
            autocomplete="off"
            name="query"
            id="searchInput" />
@@ -21,7 +21,7 @@
     <input type="text"
            class="shadow-inner flex-1 dark:bg-black dark:text-gray-100 bg-gray-50 border dark:border-gray-600 border-gray-300 text-gray-900 text-sm rounded-sm block w-full p-2.5"
            value="${query}"
-           placeholder="Search..."
+           placeholder="Search the web!"
            autocomplete="off"
            name="query"
            id="searchInput" />
@@ -24,25 +24,25 @@ This is a sample of real crawl data. It is intended for demo, testing and devel
 <tr>
     <td><input id="sample-s" value="sample-s" name="sample" class="form-check-input" type="radio"></td>
     <td><label for="sample-s">Small</label></td>
-    <td>1000 Domains. About 2 GB. </td>
+    <td>1000 Domains. About 1 GB. </td>
 </tr>
 
 <tr>
     <td><input id="sample-m" value="sample-m" name="sample" class="form-check-input" type="radio"></td>
     <td><label for="sample-m">Medium</label></td>
-    <td>2000 Domains. About 6 GB. Recommended.</td>
+    <td>2000 Domains. About 2 GB. Recommended.</td>
 </tr>
 
 <tr>
     <td><input id="sample-l" value="sample-l" name="sample" class="form-check-input" type="radio"></td>
     <td><label for="sample-l">Large</label></td>
-    <td>5000 Domains. About 20 GB.</td>
+    <td>5000 Domains. About 7 GB.</td>
 </tr>
 
 <tr>
     <td><input id="sample-xl" value="sample-xl" name="sample" class="form-check-input" type="radio"></td>
     <td><label for="sample-xl">Huge</label></td>
-    <td>50,000 Domains. Around 180 GB. Primarily intended for pre-production like testing environments.
+    <td>50,000 Domains. Around 80 GB. Primarily intended for pre-production like testing environments.
     Expect hours of processing time. </td>
 </tr>
 </table>