Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-10-05 21:22:39 +02:00)

Compare commits: deploy-001 ... deploy-001 (5 commits)

- 0ea8092350
- 483d29497e
- bae44497fe
- 0d59202aca
- 0ca43f0c9c
@@ -20,34 +20,11 @@ public record ContentTags(String etag, String lastMod) {
     public void paint(Request.Builder getBuilder) {
 
         if (etag != null) {
-            getBuilder.addHeader("If-None-Match", ifNoneMatch());
+            getBuilder.addHeader("If-None-Match", etag);
         }
 
         if (lastMod != null) {
-            getBuilder.addHeader("If-Modified-Since", ifModifiedSince());
+            getBuilder.addHeader("If-Modified-Since", lastMod);
         }
     }
-
-    private String ifNoneMatch() {
-        // Remove the W/ prefix if it exists
-
-        // 'W/' (case-sensitive) indicates that a weak validator is used. Weak etags are
-        // easy to generate, but are far less useful for comparisons. Strong validators
-        // are ideal for comparisons but can be very difficult to generate efficiently.
-        // Weak ETag values of two representations of the same resources might be semantically
-        // equivalent, but not byte-for-byte identical. This means weak etags prevent caching
-        // when byte range requests are used, but strong etags mean range requests can
-        // still be cached.
-        // - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/ETag
-
-        if (null != etag && etag.startsWith("W/")) {
-            return etag.substring(2);
-        } else {
-            return etag;
-        }
-    }
-
-    private String ifModifiedSince() {
-        return lastMod;
-    }
 }
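For context on what these validator headers do on the wire (an illustrative aside, not part of the commit): a client that remembers a response's ETag and Last-Modified values replays them as If-None-Match and If-Modified-Since, and a 304 Not Modified reply means the cached copy can be reused. Since If-None-Match comparison is defined as weak, a `W/`-prefixed etag can be sent back verbatim, which appears to be why the prefix-stripping helper above was dropped. The sketch below uses java.net.http with a made-up URL and made-up header values.

```java
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

// Stand-alone sketch of a conditional GET; values are invented for illustration.
public class ConditionalGetExample {
    public static void main(String[] args) throws Exception {
        String etag = "W/\"33a64df5\"";                      // remembered from an earlier ETag header
        String lastMod = "Wed, 01 Jan 2025 00:00:00 GMT";    // remembered from an earlier Last-Modified header

        HttpRequest request = HttpRequest.newBuilder(URI.create("https://www.example.com/"))
                .header("If-None-Match", etag)                // sent verbatim, weak prefix and all
                .header("If-Modified-Since", lastMod)
                .GET()
                .build();

        HttpResponse<String> response = HttpClient.newHttpClient()
                .send(request, HttpResponse.BodyHandlers.ofString());

        if (response.statusCode() == 304) {
            System.out.println("Not modified - reuse the previously stored document");
        } else {
            System.out.println("Fetched a fresh copy: " + response.body().length() + " chars");
        }
    }
}
```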
@@ -34,8 +34,9 @@ import java.util.*;
 public class WarcRecorder implements AutoCloseable {
     /** Maximum time we'll wait on a single request */
     static final int MAX_TIME = 30_000;
-    /** Maximum (decompressed) size we'll fetch */
-    static final int MAX_SIZE = 1024 * 1024 * 10;
+
+    /** Maximum (decompressed) size we'll save */
+    static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);
 
     private final WarcWriter writer;
     private final Path warcFile;
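The new constant reads its cap from a JVM system property, falling back to 10 MiB. A minimal, self-contained sketch of the same `Integer.getInteger` idiom follows; only the property name `crawler.maxFetchSize` comes from the diff, the class itself is illustrative.

```java
// Illustrative stand-alone class demonstrating the Integer.getInteger(name, default) idiom.
public class MaxSizeDemo {
    // Reads the system property, falling back to 10 MiB when it is unset.
    static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);

    public static void main(String[] args) {
        // Launch with `java -Dcrawler.maxFetchSize=52428800 MaxSizeDemo` to raise the cap to 50 MiB.
        System.out.println("Effective max fetch size: " + MAX_SIZE + " bytes");
    }
}
```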
@@ -1,11 +1,15 @@
 package nu.marginalia.io;
 
+import nu.marginalia.model.crawldata.CrawledDocument;
+import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.model.crawldata.SerializableCrawlData;
 import org.jetbrains.annotations.Nullable;
 
 import java.io.IOException;
 import java.nio.file.Path;
+import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.List;
 
 /** Closable iterator exceptional over serialized crawl data
  * The data may appear in any order, and the iterator must be closed.
@@ -26,6 +30,37 @@ public interface SerializableCrawlDataStream extends AutoCloseable {
     @Nullable
     default Path path() { return null; }
 
+    /** For tests */
+    default List<SerializableCrawlData> asList() throws IOException {
+        List<SerializableCrawlData> data = new ArrayList<>();
+        while (hasNext()) {
+            data.add(next());
+        }
+        return data;
+    }
+
+    /** For tests */
+    default List<CrawledDocument> docsAsList() throws IOException {
+        List<CrawledDocument> data = new ArrayList<>();
+        while (hasNext()) {
+            if (next() instanceof CrawledDocument doc) {
+                data.add(doc);
+            }
+        }
+        return data;
+    }
+
+    /** For tests */
+    default List<CrawledDomain> domainsAsList() throws IOException {
+        List<CrawledDomain> data = new ArrayList<>();
+        while (hasNext()) {
+            if (next() instanceof CrawledDomain domain) {
+                data.add(domain);
+            }
+        }
+        return data;
+    }
+
     // Dummy iterator over nothing
     static SerializableCrawlDataStream empty() {
         return new SerializableCrawlDataStream() {
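A hedged sketch of what the new default helpers buy a test: rather than hand-rolling `hasNext()`/`next()` loops with `instanceof` checks, the stream can be collapsed into typed lists. `obtainStreamSomehow()` below is a hypothetical stand-in for however the test gets hold of a stream, and note that the helpers consume the stream, so only one of them can usefully be called per instance.

```java
// Hypothetical test fragment; obtainStreamSomehow() stands in for the code under test.
try (SerializableCrawlDataStream stream = obtainStreamSomehow()) {
    List<CrawledDocument> docs = stream.docsAsList();  // keeps only the CrawledDocument records
    Assertions.assertFalse(docs.isEmpty());
    Assertions.assertTrue(docs.getFirst().documentBody.startsWith("<!doctype"));
}
```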
@@ -26,6 +26,7 @@ import java.net.http.HttpHeaders;
 import java.net.http.HttpRequest;
 import java.net.http.HttpResponse;
 import java.time.Duration;
+import java.util.ArrayList;
 import java.util.List;
 import java.util.Optional;
 import java.util.concurrent.ThreadLocalRandom;
@@ -47,6 +48,8 @@ public class SimpleLinkScraper implements AutoCloseable {
     private final Duration readTimeout = Duration.ofSeconds(10);
     private final DomainLocks domainLocks = new DomainLocks();
 
+    private final static int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);
+
     public SimpleLinkScraper(LiveCrawlDataSet dataSet,
                              DbDomainQueries domainQueries,
                              DomainBlacklist domainBlacklist) {
@@ -65,52 +68,68 @@ public class SimpleLinkScraper implements AutoCloseable {
         pool.submitQuietly(() -> retrieveNow(domain, id.getAsInt(), urls));
     }
 
-    public void retrieveNow(EdgeDomain domain, int domainId, List<String> urls) throws Exception {
+    public int retrieveNow(EdgeDomain domain, int domainId, List<String> urls) throws Exception {
+
+        EdgeUrl rootUrl = domain.toRootUrlHttps();
+
+        List<EdgeUrl> relevantUrls = new ArrayList<>();
+
+        for (var url : urls) {
+            Optional<EdgeUrl> optParsedUrl = lp.parseLink(rootUrl, url);
+            if (optParsedUrl.isEmpty()) {
+                continue;
+            }
+            if (dataSet.hasUrl(optParsedUrl.get())) {
+                continue;
+            }
+            relevantUrls.add(optParsedUrl.get());
+        }
+
+        if (relevantUrls.isEmpty()) {
+            return 0;
+        }
+
+        int fetched = 0;
+
         try (HttpClient client = HttpClient
                 .newBuilder()
                 .connectTimeout(connectTimeout)
                 .followRedirects(HttpClient.Redirect.NEVER)
                 .version(HttpClient.Version.HTTP_2)
                 .build();
-             DomainLocks.DomainLock lock = domainLocks.lockDomain(domain) // throttle concurrent access per domain; do not remove
+             // throttle concurrent access per domain; IDE will complain it's not used, but it holds a semaphore -- do not remove:
+             DomainLocks.DomainLock lock = domainLocks.lockDomain(domain)
         ) {
 
-            EdgeUrl rootUrl = domain.toRootUrlHttps();
-
             SimpleRobotRules rules = fetchRobotsRules(rootUrl, client);
 
             if (rules == null) { // I/O error fetching robots.txt
                 // If we can't fetch the robots.txt,
-                for (var url : urls) {
-                    lp.parseLink(rootUrl, url).ifPresent(this::maybeFlagAsBad);
+                for (var url : relevantUrls) {
+                    maybeFlagAsBad(url);
                 }
-                return;
+                return fetched;
             }
 
             CrawlDelayTimer timer = new CrawlDelayTimer(rules.getCrawlDelay());
 
-            for (var url : urls) {
-                Optional<EdgeUrl> optParsedUrl = lp.parseLink(rootUrl, url);
-                if (optParsedUrl.isEmpty()) {
-                    continue;
-                }
-                if (dataSet.hasUrl(optParsedUrl.get())) {
-                    continue;
-                }
+            for (var parsedUrl : relevantUrls) {
 
-                EdgeUrl parsedUrl = optParsedUrl.get();
-                if (!rules.isAllowed(url)) {
+                if (!rules.isAllowed(parsedUrl.toString())) {
                     maybeFlagAsBad(parsedUrl);
                     continue;
                 }
 
                 switch (fetchUrl(domainId, parsedUrl, timer, client)) {
-                    case FetchResult.Success(int id, EdgeUrl docUrl, String body, String headers)
-                            -> dataSet.saveDocument(id, docUrl, body, headers, "");
+                    case FetchResult.Success(int id, EdgeUrl docUrl, String body, String headers) -> {
+                        dataSet.saveDocument(id, docUrl, body, headers, "");
+                        fetched++;
+                    }
                     case FetchResult.Error(EdgeUrl docUrl) -> maybeFlagAsBad(docUrl);
                 }
             }
         }
+
+        return fetched;
     }
 
     private void maybeFlagAsBad(EdgeUrl url) {
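The comment about the seemingly unused lock is worth unpacking: the resource is acquired in the try header purely for its side effect and released by the implicit close(), which is what throttles concurrent requests against a single domain. Below is a hedged sketch of that idea using a plain `Semaphore`; it is not the project's `DomainLocks` implementation, and the permit count is invented.

```java
import java.util.concurrent.Semaphore;

// Sketch of a per-domain throttle in the style the try-with-resources header above relies on.
class DomainLockSketch {
    private final Semaphore permits = new Semaphore(2);  // allow at most 2 concurrent fetches (illustrative)

    AutoCloseable lockDomain() throws InterruptedException {
        permits.acquire();
        return permits::release;  // close() releases the permit
    }

    void fetchWithThrottle(Runnable doFetch) throws Exception {
        // The IDE may flag `lock` as unused, but it holds the permit for the duration of the block.
        try (AutoCloseable lock = lockDomain()) {
            doFetch.run();
        }
    }
}
```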
@@ -190,7 +209,7 @@ public class SimpleLinkScraper implements AutoCloseable {
         }
 
         byte[] body = getResponseData(response);
-        if (body.length > 1024 * 1024) {
+        if (body.length > MAX_SIZE) {
             return new FetchResult.Error(parsedUrl);
         }
 
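The switch in `retrieveNow` above uses Java 21 record patterns over what looks like a sealed result type. A self-contained illustration of the construct follows; `FetchResult` here is a stand-in with made-up components, not the project's type.

```java
import java.util.List;

// Stand-in sealed hierarchy; the real FetchResult carries different components.
sealed interface FetchResult {
    record Success(int id, String body) implements FetchResult {}
    record Error(String url) implements FetchResult {}

    static int countSuccesses(List<FetchResult> results) {
        int fetched = 0;
        for (FetchResult result : results) {
            switch (result) {
                // The case label deconstructs the record, binding its components directly.
                case Success(int id, String body) -> fetched++;
                case Error(String url) -> System.err.println("failed: " + url);
            }
        }
        return fetched;
    }
}
```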
@@ -3,8 +3,8 @@ package nu.marginalia.livecrawler;
 import nu.marginalia.db.DomainBlacklistImpl;
 import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.crawldata.CrawledDocument;
 import nu.marginalia.model.crawldata.CrawledDomain;
 import org.apache.commons.io.FileUtils;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.Assertions;
@@ -38,7 +38,8 @@ class SimpleLinkScraperTest {
     @Test
     public void testRetrieveNow() throws Exception {
         var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class));
-        scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
+        int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
+        Assertions.assertEquals(1, fetched);
 
         var streams = dataSet.getDataStreams();
         Assertions.assertEquals(1, streams.size());
@@ -46,23 +47,20 @@ class SimpleLinkScraperTest {
         SerializableCrawlDataStream firstStream = streams.iterator().next();
         Assertions.assertTrue(firstStream.hasNext());
 
         if (firstStream.next() instanceof CrawledDomain domain) {
             Assertions.assertEquals("www.marginalia.nu", domain.getDomain());
         }
         else {
             Assertions.fail();
         }
+        List<CrawledDocument> documents = firstStream.docsAsList();
+        Assertions.assertEquals(1, documents.size());
+        Assertions.assertTrue(documents.getFirst().documentBody.startsWith("<!doctype"));
+    }
 
-        Assertions.assertTrue(firstStream.hasNext());
-
-        if ((firstStream.next() instanceof CrawledDocument document)) {
-            // verify we decompress the body string
-            Assertions.assertTrue(document.documentBody.startsWith("<!doctype"));
-        }
-        else {
-            Assertions.fail();
-        }
-
-        Assertions.assertFalse(firstStream.hasNext());
+    @Test
+    public void testRetrieveNow_Redundant() throws Exception {
+        dataSet.saveDocument(1, new EdgeUrl("https://www.marginalia.nu/"), "<html>", "", "127.0.0.1");
+        var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class));
+
+        // If the requested URL is already in the dataSet, retrieveNow should short-circuit and not fetch anything
+        int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
+        Assertions.assertEquals(0, fetched);
     }
 }
@@ -0,0 +1,14 @@
+<section id="frontpage-tips">
+    <h2>Public Beta Available</h2>
+    <div class="info">
+        <p>
+            A redesigned version of the search engine UI is available for beta testing.
+            Feel free to give it a spin; feedback is welcome!
+            The old one will also remain available if you hate it,
+            or have compatibility issues.
+        </p>
+        <p>
+            <a href="https://test.marginalia.nu/">Try it out!</a>
+        </p>
+    </div>
+</section>
@@ -24,7 +24,7 @@
 <section id="frontpage">
     {{>search/index/index-news}}
     {{>search/index/index-about}}
-    {{>search/index/index-tips}}
+    {{>search/index/index-redesign}}
 </section>
 
 {{>search/parts/search-footer}}
tools/deployment/deployment.py (+2, Normal file → Executable file)

@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 from dataclasses import dataclass
 import subprocess, os
 from typing import List, Set, Dict, Optional