Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-10-06 17:32:39 +02:00)

Compare commits: deploy-005 ... deploy-006 (21 commits)
Commits in this comparison (abbreviated SHA1; author and date not captured in this mirror):
579a115243, 2c67f50a43, 78a958e2b0, 4e939389b2, e67a9bdb91, 567e4e1237, 4342e42722, bc818056e6, de2feac238, 1e770205a5, e44ecd6d69, 5b93a0e633, 08fb0e5efe, bcf67782ea, ef3f175ede, bbe4b5d9fd, c67a635103, 20b24133fb, f2567677e8, bc2c2061f2, 1c7f5a31a5
ROADMAP.md (39 changed lines)
@@ -1,4 +1,4 @@
-# Roadmap 2024-2025
+# Roadmap 2025

 This is a roadmap with major features planned for Marginalia Search.

@@ -30,12 +30,6 @@ Retaining the ability to independently crawl the web is still strongly desirable
 The search engine has a bit of a problem showing spicy content mixed in with the results. It would be desirable to have a way to filter this out. It's likely something like a URL blacklist (e.g. [UT1](https://dsi.ut-capitole.fr/blacklists/index_en.php) )
 combined with naive bayesian filter would go a long way, or something more sophisticated...?

-## Web Design Overhaul
-
-The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
-
-In progress: PR [#127](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/127) -- demo available at https://test.marginalia.nu/
-
 ## Additional Language Support

 It would be desirable if the search engine supported more languages than English. This is partially about
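Aside (not part of the diff): the roadmap item above suggests pairing a UT1-style URL blacklist with a naive Bayesian filter. A rough, hypothetical sketch of that combination — the class, field names, and threshold are made up for illustration, not code from this repository:

```java
import java.util.*;

// Hypothetical sketch: hard-block blacklisted domains, soft-block by a naive Bayes style score.
class SpicyContentFilter {
    private final Set<String> blacklistedDomains;    // e.g. loaded from UT1 category lists
    private final Map<String, Double> spicyLogOdds;  // per-token log(P(token|spicy) / P(token|clean))
    private final double threshold;

    SpicyContentFilter(Set<String> blacklistedDomains, Map<String, Double> spicyLogOdds, double threshold) {
        this.blacklistedDomains = blacklistedDomains;
        this.spicyLogOdds = spicyLogOdds;
        this.threshold = threshold;
    }

    boolean shouldHide(String domain, List<String> documentTokens) {
        if (blacklistedDomains.contains(domain))
            return true;                              // hard block from the URL blacklist

        double score = 0;
        for (String token : documentTokens) {
            score += spicyLogOdds.getOrDefault(token.toLowerCase(Locale.ROOT), 0.0);
        }
        return score > threshold;                     // soft block from the Bayesian score
    }
}
```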
@@ -62,8 +56,31 @@ filter for any API consumer.

 I've talked to the stract dev and he does not think it's a good idea to mimic their optics language, which is quite ad-hoc, but instead to work together to find some new common description language for this.

+## Show favicons next to search results
+
+This is expected from search engines. Basic proof of concept sketch of fetching this data has been done, but the feature is some way from being reality.
+
+## Specialized crawler for github
+
+One of the search engine's biggest limitations right now is that it does not index github at all. A specialized crawler that fetches at least the readme.md would go a long way toward providing search capabilities in this domain.
+
 # Completed

+## Web Design Overhaul (COMPLETED 2025-01)
+
+The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
+
+PR [#127](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/127)
+
+## Finalize RSS support (COMPLETED 2024-11)
+
+Marginalia has experimental RSS preview support for a few domains. This works well and
+it should be extended to all domains. It would also be interesting to offer search of the
+RSS data itself, or use the RSS set to feed a special live index that updates faster than the
+main dataset.
+
+Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122) and PR [#125](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/125)
+
 ## Proper Position Index (COMPLETED 2024-09)

 The search engine uses a fixed width bit mask to indicate word positions. It has the benefit
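Aside (not part of the diff): the "Specialized crawler for github" item added above describes fetching at least the readme.md per repository. A minimal, hypothetical illustration of that idea using the JDK HTTP client — the raw.githubusercontent.com URL pattern and branch guesses are assumptions, and a real crawler would also need rate limiting and WARC recording:

```java
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.Optional;

// Hypothetical sketch: fetch a repository's README via the raw content host.
class GithubReadmeFetcher {
    private final HttpClient client = HttpClient.newHttpClient();

    Optional<String> fetchReadme(String owner, String repo) throws Exception {
        // Try the most common default-branch names.
        for (String branch : new String[] { "master", "main" }) {
            var request = HttpRequest.newBuilder()
                    .uri(URI.create("https://raw.githubusercontent.com/%s/%s/%s/README.md"
                            .formatted(owner, repo, branch)))
                    .GET()
                    .build();
            var rsp = client.send(request, HttpResponse.BodyHandlers.ofString());
            if (rsp.statusCode() == 200) {
                return Optional.of(rsp.body());
            }
        }
        return Optional.empty();
    }
}
```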
@@ -76,11 +93,3 @@ list, as is the civilized way of doing this.

 Completed with PR [#99](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/99)
-
-## Finalize RSS support (COMPLETED 2024-11)
-
-Marginalia has experimental RSS preview support for a few domains. This works well and
-it should be extended to all domains. It would also be interesting to offer search of the
-RSS data itself, or use the RSS set to feed a special live index that updates faster than the
-main dataset.
-
-Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122) and PR [#125](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/125)
@@ -47,7 +47,7 @@ ext {
 dockerImageBase='container-registry.oracle.com/graalvm/jdk:23'
 dockerImageTag='latest'
 dockerImageRegistry='marginalia'
-jibVersion = '3.4.3'
+jibVersion = '3.4.4'

 }
@@ -16,20 +16,19 @@ import org.slf4j.LoggerFactory;

 import java.util.ArrayList;
 import java.util.Comparator;
-import java.util.Iterator;
 import java.util.List;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
+import java.util.concurrent.atomic.AtomicInteger;
-import static java.lang.Math.clamp;
+import java.util.function.Consumer;

 @Singleton
 public class IndexClient {
     private static final Logger logger = LoggerFactory.getLogger(IndexClient.class);
     private final GrpcMultiNodeChannelPool<IndexApiGrpc.IndexApiBlockingStub> channelPool;
     private final DomainBlacklistImpl blacklist;
-    private static final ExecutorService executor = Executors.newVirtualThreadPerTaskExecutor();
+    private static final ExecutorService executor = Executors.newCachedThreadPool();

     @Inject
     public IndexClient(GrpcChannelPoolFactory channelPoolFactory, DomainBlacklistImpl blacklist) {
@@ -51,40 +50,37 @@ public class IndexClient {

     /** Execute a query on the index partitions and return the combined results. */
     public AggregateQueryResponse executeQueries(RpcIndexQuery indexRequest, Pagination pagination) {
-        List<CompletableFuture<Iterator<RpcDecoratedResultItem>>> futures =
-                channelPool.call(IndexApiGrpc.IndexApiBlockingStub::query)
-                        .async(executor)
-                        .runEach(indexRequest);
-
         final int requestedMaxResults = indexRequest.getQueryLimits().getResultsTotal();
-        final int resultsUpperBound = requestedMaxResults * channelPool.getNumNodes();
-
-        List<RpcDecoratedResultItem> results = new ArrayList<>(resultsUpperBound);
-
-        for (var future : futures) {
-            try {
-                future.get().forEachRemaining(results::add);
-            }
-            catch (Exception e) {
-                logger.error("Downstream exception", e);
-            }
-        }
-
-        // Sort the results by ranking score and remove blacklisted domains
-        results.sort(comparator);
-        results.removeIf(this::isBlacklisted);
-
-        int numReceivedResults = results.size();
-
-        // pagination is typically 1-indexed, so we need to adjust the start and end indices
-        int indexStart = (pagination.page - 1) * pagination.pageSize;
-        int indexEnd = (pagination.page) * pagination.pageSize;
-
-        results = results.subList(
-                clamp(indexStart, 0, Math.max(0, results.size() - 1)), // from is inclusive, so subtract 1 from size()
-                clamp(indexEnd, 0, results.size()));
-
-        return new AggregateQueryResponse(results, pagination.page(), numReceivedResults);
+        AtomicInteger totalNumResults = new AtomicInteger(0);
+
+        List<RpcDecoratedResultItem> results =
+                channelPool.call(IndexApiGrpc.IndexApiBlockingStub::query)
+                        .async(executor)
+                        .runEach(indexRequest)
+                        .stream()
+                        .map(future -> future.thenApply(iterator -> {
+                            List<RpcDecoratedResultItem> ret = new ArrayList<>(requestedMaxResults);
+                            iterator.forEachRemaining(ret::add);
+                            totalNumResults.addAndGet(ret.size());
+                            return ret;
+                        }))
+                        .mapMulti((CompletableFuture<List<RpcDecoratedResultItem>> fut, Consumer<List<RpcDecoratedResultItem>> c) -> {
+                            try {
+                                c.accept(fut.join());
+                            } catch (Exception e) {
+                                logger.error("Error while fetching results", e);
+                            }
+                        })
+                        .flatMap(List::stream)
+                        .filter(item -> !isBlacklisted(item))
+                        .sorted(comparator)
+                        .skip(Math.max(0, (pagination.page - 1) * pagination.pageSize))
+                        .limit(pagination.pageSize)
+                        .toList();
+
+        return new AggregateQueryResponse(results, pagination.page(), totalNumResults.get());
     }

     private boolean isBlacklisted(RpcDecoratedResultItem item) {
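Aside (not part of the diff): the rewritten executeQueries replaces the clamp/subList pagination with Stream.skip/limit and uses Stream.mapMulti (Java 16+) to join each per-partition future and silently drop the ones that failed. A standalone sketch of that mapMulti idiom, with made-up values, for readers unfamiliar with it:

```java
import java.util.List;
import java.util.concurrent.CompletableFuture;

class MapMultiDemo {
    public static void main(String[] args) {
        List<CompletableFuture<Integer>> futures = List.of(
                CompletableFuture.completedFuture(1),
                CompletableFuture.failedFuture(new RuntimeException("boom")),
                CompletableFuture.completedFuture(3));

        // mapMulti lets one input element emit zero or more outputs:
        // a failed future simply emits nothing instead of poisoning the stream.
        List<Integer> results = futures.stream()
                .<Integer>mapMulti((future, downstream) -> {
                    try {
                        downstream.accept(future.join());
                    } catch (Exception e) {
                        // swallow and emit nothing for this element
                    }
                })
                .toList();

        System.out.println(results); // [1, 3]
    }
}
```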
@@ -106,11 +106,7 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
                 return false;

             var url = new EdgeUrl(warcResponse.target());
-            if (!Objects.equals(url.getDomain(), domain)) {
-                return false;
-            }
-
-            return true;
+            return Objects.equals(url.getDomain(), domain);
         } catch (Exception e) {
             logger.warn("Failed to process response", e);
         }
@@ -8,6 +8,7 @@ import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.DomainStateDb;
+import nu.marginalia.crawl.fetcher.Cookies;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;

@@ -200,23 +201,23 @@ public class CrawlingThenConvertingIntegrationTest {

     @Test
     public void crawlRobotsTxt() throws Exception {
-        var specs = new CrawlerMain.CrawlSpecRecord("search.marginalia.nu", 5,
-                List.of("https://search.marginalia.nu/search?q=hello+world")
+        var specs = new CrawlerMain.CrawlSpecRecord("marginalia-search.com", 5,
+                List.of("https://marginalia-search.com/search?q=hello+world")
         );

         CrawledDomain domain = crawl(specs);
         assertFalse(domain.doc.isEmpty());
         assertEquals("OK", domain.crawlerStatus);
-        assertEquals("search.marginalia.nu", domain.domain);
+        assertEquals("marginalia-search.com", domain.domain);

         Set<String> allUrls = domain.doc.stream().map(doc -> doc.url).collect(Collectors.toSet());
-        assertTrue(allUrls.contains("https://search.marginalia.nu/search"), "We expect a record for entities that are forbidden");
+        assertTrue(allUrls.contains("https://marginalia-search.com/search"), "We expect a record for entities that are forbidden");

         var output = process();

         assertNotNull(output);
         assertFalse(output.documents.isEmpty());
-        assertEquals(new EdgeDomain("search.marginalia.nu"), output.domain);
+        assertEquals(new EdgeDomain("marginalia-search.com"), output.domain);
         assertEquals(DomainIndexingState.ACTIVE, output.state);

         for (var doc : output.documents) {

@@ -246,7 +247,7 @@ public class CrawlingThenConvertingIntegrationTest {
     private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws Exception {
         List<SerializableCrawlData> data = new ArrayList<>();

-        try (var recorder = new WarcRecorder(fileName);
+        try (var recorder = new WarcRecorder(fileName, new Cookies());
             var db = new DomainStateDb(dbTempFile))
         {
             new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, db, recorder).crawlDomain();
@@ -55,7 +55,6 @@ dependencies {
     implementation libs.zstd
     implementation libs.jwarc
     implementation libs.crawlercommons
-    implementation libs.okhttp3
     implementation libs.jsoup
     implementation libs.opencsv
     implementation libs.fastutil
@@ -33,8 +33,6 @@ import nu.marginalia.service.module.DatabaseModule;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorageId;
 import nu.marginalia.util.SimpleBlockingThreadPool;
-import okhttp3.ConnectionPool;
-import okhttp3.Dispatcher;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -85,6 +83,7 @@ public class CrawlerMain extends ProcessMainClass {

     @Inject
     public CrawlerMain(UserAgent userAgent,
+                       HttpFetcherImpl httpFetcher,
                        ProcessHeartbeatImpl heartbeat,
                        MessageQueueFactory messageQueueFactory, DomainProber domainProber,
                        FileStorageService fileStorageService,

@@ -98,6 +97,7 @@ public class CrawlerMain extends ProcessMainClass {
         super(messageQueueFactory, processConfiguration, gson, CRAWLER_INBOX);

         this.userAgent = userAgent;
+        this.fetcher = httpFetcher;
         this.heartbeat = heartbeat;
         this.domainProber = domainProber;
         this.fileStorageService = fileStorageService;

@@ -111,10 +111,6 @@ public class CrawlerMain extends ProcessMainClass {
                 Integer.getInteger("crawler.poolSize", 256),
                 1);

-        fetcher = new HttpFetcherImpl(userAgent,
-                new Dispatcher(),
-                new ConnectionPool(5, 10, TimeUnit.SECONDS)
-        );
-
         // Wait for the blacklist to be loaded before starting the crawl
         blacklist.waitUntilLoaded();

@@ -132,6 +128,10 @@ public class CrawlerMain extends ProcessMainClass {
         System.setProperty("sun.net.client.defaultConnectTimeout", "30000");
         System.setProperty("sun.net.client.defaultReadTimeout", "30000");

+        // Set the maximum number of connections to keep alive in the connection pool
+        System.setProperty("jdk.httpclient.idleTimeout", "15"); // 15 seconds
+        System.setProperty("jdk.httpclient.connectionPoolSize", "256");
+
         // We don't want to use too much memory caching sessions for https
         System.setProperty("javax.net.ssl.sessionCacheSize", "2048");

@@ -364,9 +364,9 @@ public class CrawlerMain extends ProcessMainClass {
             Files.deleteIfExists(tempFile);
         }

-        try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
+        try (var warcRecorder = new WarcRecorder(newWarcFile, fetcher); // write to a temp file for now
             var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
-            CrawlDataReference reference = getReference();
+            CrawlDataReference reference = getReference()
             )
         {
             // Resume the crawl if it was aborted
@@ -1,6 +1,6 @@
 package nu.marginalia.crawl.fetcher;

-import okhttp3.Request;
+import java.net.http.HttpRequest;

 /** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
 public record ContentTags(String etag, String lastMod) {

@@ -17,14 +17,14 @@ public record ContentTags(String etag, String lastMod) {
     }

     /** Paints the tags onto the request builder. */
-    public void paint(Request.Builder getBuilder) {
+    public void paint(HttpRequest.Builder getBuilder) {

         if (etag != null) {
-            getBuilder.addHeader("If-None-Match", etag);
+            getBuilder.header("If-None-Match", etag);
         }

         if (lastMod != null) {
-            getBuilder.addHeader("If-Modified-Since", lastMod);
+            getBuilder.header("If-Modified-Since", lastMod);
         }
     }
 }
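Aside (not part of the diff): a hypothetical usage sketch of the updated paint method, showing how the conditional-request headers end up on a java.net.http request. It assumes the ContentTags record from the hunk above is on the classpath; the URL and tag values are made up:

```java
import java.net.URI;
import java.net.http.HttpRequest;

class ContentTagsUsageDemo {
    public static void main(String[] args) {
        var tags = new ContentTags("\"abc123\"", "Wed, 01 Jan 2025 00:00:00 GMT");

        var builder = HttpRequest.newBuilder()
                .GET()
                .uri(URI.create("https://example.com/some-page"));

        // Adds If-None-Match / If-Modified-Since only when the corresponding tag is present.
        tags.paint(builder);

        HttpRequest request = builder.build();
        System.out.println(request.headers().map());
    }
}
```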
@@ -1,33 +1,14 @@
 package nu.marginalia.crawl.fetcher;

-import okhttp3.Cookie;
-import okhttp3.CookieJar;
-import okhttp3.HttpUrl;
-
-import java.util.Collections;
+import java.io.IOException;
+import java.net.CookieHandler;
+import java.net.URI;
+
 import java.util.List;
+import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;

-public class Cookies {
-    final ThreadLocal<ConcurrentHashMap<String, List<Cookie>>> cookieJar = ThreadLocal.withInitial(ConcurrentHashMap::new);
-
-    public CookieJar getJar() {
-        return new CookieJar() {
-
-            @Override
-            public void saveFromResponse(HttpUrl url, List<Cookie> cookies) {
-
-                if (!cookies.isEmpty()) {
-                    cookieJar.get().put(url.host(), cookies);
-                }
-            }
-
-            @Override
-            public List<Cookie> loadForRequest(HttpUrl url) {
-                return cookieJar.get().getOrDefault(url.host(), Collections.emptyList());
-            }
-        };
-    }
+public class Cookies extends CookieHandler {
+    final ThreadLocal<ConcurrentHashMap<String, List<String>>> cookieJar = ThreadLocal.withInitial(ConcurrentHashMap::new);

     public void clear() {
         cookieJar.get().clear();

@@ -38,6 +19,16 @@
     }

     public List<String> getCookies() {
-        return cookieJar.get().values().stream().flatMap(List::stream).map(Cookie::toString).toList();
+        return cookieJar.get().values().stream().flatMap(List::stream).toList();
+    }
+
+    @Override
+    public Map<String, List<String>> get(URI uri, Map<String, List<String>> requestHeaders) throws IOException {
+        return cookieJar.get();
+    }
+
+    @Override
+    public void put(URI uri, Map<String, List<String>> responseHeaders) throws IOException {
+        cookieJar.get().putAll(responseHeaders);
     }
 }
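Aside (not part of the diff): with this change Cookies extends java.net.CookieHandler, which is the hook the JDK HTTP client consults before and after every request (the HttpFetcherImpl hunks further down pass it via .cookieHandler(cookies)). A minimal standalone illustration of that wiring, with a trivial made-up handler:

```java
import java.io.IOException;
import java.net.CookieHandler;
import java.net.URI;
import java.net.http.HttpClient;
import java.util.List;
import java.util.Map;

class CookieHandlerWiringDemo {
    // A do-nothing CookieHandler, only to show where a custom handler hooks into HttpClient.
    static class NoOpCookies extends CookieHandler {
        @Override
        public Map<String, List<String>> get(URI uri, Map<String, List<String>> requestHeaders) throws IOException {
            return Map.of();          // nothing to send
        }

        @Override
        public void put(URI uri, Map<String, List<String>> responseHeaders) throws IOException {
            // ignore Set-Cookie headers
        }
    }

    public static void main(String[] args) {
        // The client calls get() before each request and put() after each response.
        HttpClient client = HttpClient.newBuilder()
                .cookieHandler(new NoOpCookies())
                .build();
        System.out.println(client.cookieHandler().isPresent()); // true
    }
}
```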
@@ -3,6 +3,7 @@ package nu.marginalia.crawl.fetcher;
 import com.google.inject.ImplementedBy;
 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.retreival.CrawlDelayTimer;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;

@@ -11,10 +12,10 @@ import nu.marginalia.model.crawldata.CrawlerDomainStatus;
 import java.util.List;

 @ImplementedBy(HttpFetcherImpl.class)
-public interface HttpFetcher {
+public interface HttpFetcher extends AutoCloseable {
     void setAllowAllContentTypes(boolean allowAllContentTypes);

-    List<String> getCookies();
+    Cookies getCookies();
     void clearCookies();

     DomainProbeResult probeDomain(EdgeUrl url);

@@ -27,7 +28,9 @@ public interface HttpFetcher {
     HttpFetchResult fetchContent(EdgeUrl url,
                                  WarcRecorder recorder,
                                  ContentTags tags,
-                                 ProbeType probeType) throws HttpFetcherImpl.RateLimitException, Exception;
+                                 ProbeType probeType) throws Exception;
+
+    List<EdgeUrl> fetchSitemapUrls(String rootSitemapUrl, CrawlDelayTimer delayTimer);

     SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder);
@@ -1,35 +1,41 @@
 package nu.marginalia.crawl.fetcher;

 import com.google.inject.Inject;
+import com.google.inject.Singleton;
 import crawlercommons.robots.SimpleRobotRules;
 import crawlercommons.robots.SimpleRobotRulesParser;
 import nu.marginalia.UserAgent;
-import nu.marginalia.crawl.fetcher.socket.FastTerminatingSocketFactory;
-import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
 import nu.marginalia.crawl.fetcher.socket.NoSecuritySSL;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.retreival.CrawlDelayTimer;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.ContentTypeLogic;
 import nu.marginalia.model.body.DocumentBodyExtractor;
 import nu.marginalia.model.body.HttpFetchResult;
 import nu.marginalia.model.crawldata.CrawlerDomainStatus;
-import okhttp3.ConnectionPool;
-import okhttp3.Dispatcher;
-import okhttp3.OkHttpClient;
-import okhttp3.Request;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.parser.Parser;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import javax.net.ssl.X509TrustManager;
-import java.io.InterruptedIOException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URISyntaxException;
+import java.net.URLDecoder;
+import java.net.http.HttpClient;
+import java.net.http.HttpRequest;
+import java.net.http.HttpResponse;
+import java.net.http.HttpTimeoutException;
+import java.nio.charset.StandardCharsets;
 import java.time.Duration;
-import java.util.List;
-import java.util.Objects;
-import java.util.Optional;
-import java.util.concurrent.TimeUnit;
+import java.util.*;
+import java.util.concurrent.Executors;
+import java.util.zip.GZIPInputStream;

+@Singleton
 public class HttpFetcherImpl implements HttpFetcher {

     private final Logger logger = LoggerFactory.getLogger(getClass());

@@ -40,39 +46,28 @@
     private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
     private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();

+    private final Duration requestTimeout = Duration.ofSeconds(10);
+
     @Override
     public void setAllowAllContentTypes(boolean allowAllContentTypes) {
         contentTypeLogic.setAllowAllContentTypes(allowAllContentTypes);
     }

-    private final OkHttpClient client;
-
-    private static final FastTerminatingSocketFactory ftSocketFactory = new FastTerminatingSocketFactory();
-
-    private OkHttpClient createClient(Dispatcher dispatcher, ConnectionPool pool) {
-        var builder = new OkHttpClient.Builder();
-        if (dispatcher != null) {
-            builder.dispatcher(dispatcher);
-        }
-
-        return builder.sslSocketFactory(NoSecuritySSL.buildSocketFactory(), (X509TrustManager) NoSecuritySSL.trustAllCerts[0])
-                .socketFactory(ftSocketFactory)
-                .hostnameVerifier(NoSecuritySSL.buildHostnameVerifyer())
-                .addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
-                .connectionPool(pool)
-                .cookieJar(cookies.getJar())
-                .followRedirects(true)
-                .followSslRedirects(true)
-                .connectTimeout(8, TimeUnit.SECONDS)
-                .readTimeout(10, TimeUnit.SECONDS)
-                .writeTimeout(10, TimeUnit.SECONDS)
+    private final HttpClient client;
+
+    private HttpClient createClient() {
+        return HttpClient.newBuilder()
+                .sslContext(NoSecuritySSL.buildSslContext())
+                .cookieHandler(cookies)
+                .followRedirects(HttpClient.Redirect.NORMAL)
+                .connectTimeout(Duration.ofSeconds(8))
+                .executor(Executors.newCachedThreadPool())
                 .build();
     }

     @Override
-    public List<String> getCookies() {
-        return cookies.getCookies();
+    public Cookies getCookies() {
+        return cookies;
     }

     @Override

@@ -81,26 +76,24 @@
     }

     @Inject
-    public HttpFetcherImpl(UserAgent userAgent,
-                           Dispatcher dispatcher,
-                           ConnectionPool connectionPool)
+    public HttpFetcherImpl(UserAgent userAgent)
     {
-        this.client = createClient(dispatcher, connectionPool);
+        this.client = createClient();
         this.userAgentString = userAgent.uaString();
         this.userAgentIdentifier = userAgent.uaIdentifier();
     }

     public HttpFetcherImpl(String userAgent) {
-        this.client = createClient(null, new ConnectionPool());
+        this.client = createClient();
         this.userAgentString = userAgent;
         this.userAgentIdentifier = userAgent;
     }

     // Not necessary in prod, but useful in test
     public void close() {
-        client.dispatcher().executorService().shutdown();
-        client.connectionPool().evictAll();
+        client.close();
     }

     /**
      * Probe the domain to see if it is reachable, attempting to identify which schema to use,
      * and if there are any redirects. This is done by one or more HEAD requests.

@@ -110,19 +103,26 @@
      */
     @Override
     public DomainProbeResult probeDomain(EdgeUrl url) {
-        var head = new Request.Builder().head().addHeader("User-agent", userAgentString)
-                .url(url.toString())
-                .build();
-
-        var call = client.newCall(head);
-
-        try (var rsp = call.execute()) {
-            EdgeUrl requestUrl = new EdgeUrl(rsp.request().url().toString());
-
-            if (!Objects.equals(requestUrl.domain, url.domain)) {
-                return new DomainProbeResult.Redirect(requestUrl.domain);
-            }
-            return new DomainProbeResult.Ok(requestUrl);
+        HttpRequest head;
+        try {
+            head = HttpRequest.newBuilder()
+                    .HEAD()
+                    .uri(url.asURI())
+                    .header("User-agent", userAgentString)
+                    .timeout(requestTimeout)
+                    .build();
+        } catch (URISyntaxException e) {
+            return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
+        }
+
+        try {
+            var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
+            EdgeUrl rspUri = new EdgeUrl(rsp.uri());
+
+            if (!Objects.equals(rspUri.domain, url.domain)) {
+                return new DomainProbeResult.Redirect(rspUri.domain);
+            }
+            return new DomainProbeResult.Ok(rspUri);
         }
         catch (Exception ex) {
             return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());

@@ -140,21 +140,25 @@
                                                       WarcRecorder warcRecorder,
                                                       ContentTags tags) throws RateLimitException {
         if (tags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) {
-            var headBuilder = new Request.Builder().head()
-                    .addHeader("User-agent", userAgentString)
-                    .addHeader("Accept-Encoding", "gzip")
-                    .url(url.toString());
-
-            var head = headBuilder.build();
-            var call = client.newCall(head);
-
-            try (var rsp = call.execute()) {
-                var contentTypeHeader = rsp.header("Content-type");
+            try {
+                var headBuilder = HttpRequest.newBuilder()
+                        .HEAD()
+                        .uri(url.asURI())
+                        .header("User-agent", userAgentString)
+                        .header("Accept-Encoding", "gzip")
+                        .timeout(requestTimeout)
+                        ;
+
+                var rsp = client.send(headBuilder.build(), HttpResponse.BodyHandlers.discarding());
+                var headers = rsp.headers();
+
+                var contentTypeHeader = headers.firstValue("Content-Type").orElse(null);
+
                 if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
-                    warcRecorder.flagAsFailedContentTypeProbe(url, contentTypeHeader, rsp.code());
-                    return new ContentTypeProbeResult.BadContentType(contentTypeHeader, rsp.code());
+                    warcRecorder.flagAsFailedContentTypeProbe(url, contentTypeHeader, rsp.statusCode());
+                    return new ContentTypeProbeResult.BadContentType(contentTypeHeader, rsp.statusCode());
                 }

                 // Update the URL to the final URL of the HEAD request, otherwise we might end up doing

@@ -168,27 +172,27 @@
                 // too many eyebrows when looking at the logs on the target server. Overall it's probably desirable
                 // that it looks like the traffic makes sense, as opposed to looking like a broken bot.

-                var redirectUrl = new EdgeUrl(rsp.request().url().toString());
+                var redirectUrl = new EdgeUrl(rsp.uri());
                 EdgeUrl ret;

                 if (Objects.equals(redirectUrl.domain, url.domain)) ret = redirectUrl;
                 else ret = url;

                 // Intercept rate limiting
-                if (rsp.code() == 429) {
-                    throw new HttpFetcherImpl.RateLimitException(Objects.requireNonNullElse(rsp.header("Retry-After"), "1"));
+                if (rsp.statusCode() == 429) {
+                    throw new HttpFetcherImpl.RateLimitException(headers.firstValue("Retry-After").orElse("1"));
                 }

                 return new ContentTypeProbeResult.Ok(ret);
             }
+            catch (HttpTimeoutException ex) {
+                warcRecorder.flagAsTimeout(url);
+                return new ContentTypeProbeResult.Timeout(ex);
+            }
             catch (RateLimitException ex) {
                 throw ex;
             }
-            catch (InterruptedIOException ex) {
-                warcRecorder.flagAsTimeout(url);
-                return new ContentTypeProbeResult.Timeout(ex);
-            } catch (Exception ex) {
+            catch (Exception ex) {
                 logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());

                 warcRecorder.flagAsError(url, ex);

@@ -210,13 +214,15 @@
                                       ProbeType probeType)
             throws Exception
     {
-        var getBuilder = new Request.Builder().get();
-
-        getBuilder.url(url.toString())
-                .addHeader("Accept-Encoding", "gzip")
-                .addHeader("Accept-Language", "en,*;q=0.5")
-                .addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
-                .addHeader("User-agent", userAgentString);
+        var getBuilder = HttpRequest.newBuilder()
+                .GET()
+                .uri(url.asURI())
+                .header("User-agent", userAgentString)
+                .header("Accept-Encoding", "gzip")
+                .header("Accept-Language", "en,*;q=0.5")
+                .header("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
+                .timeout(requestTimeout)
+                ;

         contentTags.paint(getBuilder);

@@ -242,6 +248,126 @@
         return new SitemapRetriever();
     }

+    @Override
+    public List<EdgeUrl> fetchSitemapUrls(String root, CrawlDelayTimer delayTimer) {
+        try {
+            List<EdgeUrl> ret = new ArrayList<>();
+
+            Set<String> seenUrls = new HashSet<>();
+            Set<String> seenSitemaps = new HashSet<>();
+
+            Deque<EdgeUrl> sitemapQueue = new LinkedList<>();
+
+            EdgeUrl rootSitemapUrl = new EdgeUrl(root);
+
+            sitemapQueue.add(rootSitemapUrl);
+
+            int fetchedSitemaps = 0;
+
+            while (!sitemapQueue.isEmpty() && ret.size() < 20_000 && ++fetchedSitemaps < 10) {
+                var head = sitemapQueue.removeFirst();
+
+                switch (fetchSitemap(head)) {
+                    case SitemapResult.SitemapUrls(List<String> urls) -> {
+                        for (var url : urls) {
+                            if (seenUrls.add(url)) {
+                                EdgeUrl.parse(url)
+                                        .filter(u -> u.domain.equals(rootSitemapUrl.domain))
+                                        .ifPresent(ret::add);
+                            }
+                        }
+                    }
+                    case SitemapResult.SitemapReferences(List<String> refs) -> {
+                        for (var ref : refs) {
+                            if (seenSitemaps.add(ref)) {
+                                EdgeUrl.parse(ref)
+                                        .filter(url -> url.domain.equals(rootSitemapUrl.domain))
+                                        .ifPresent(sitemapQueue::addFirst);
+                            }
+                        }
+                    }
+                    case SitemapResult.SitemapError() -> {}
+                }
+
+                delayTimer.waitFetchDelay();
+            }
+
+            return ret;
+        }
+        catch (Exception ex) {
+            logger.error("Error while fetching sitemaps via {}: {} ({})", root, ex.getClass().getSimpleName(), ex.getMessage());
+            return List.of();
+        }
+    }
+
+    private SitemapResult fetchSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
+        HttpRequest getRequest = HttpRequest.newBuilder()
+                .GET()
+                .uri(sitemapUrl.asURI())
+                .header("Accept-Encoding", "gzip")
+                .header("Accept", "text/*, */*;q=0.9")
+                .header("User-agent", userAgentString)
+                .timeout(requestTimeout)
+                .build();
+
+        var response = client.send(getRequest, HttpResponse.BodyHandlers.ofInputStream());
+        if (response.statusCode() != 200) {
+            return new SitemapResult.SitemapError();
+        }
+
+        try (InputStream inputStream = response.body()) {
+            InputStream parserStream;
+            if (sitemapUrl.path.endsWith(".gz")) {
+                parserStream = new GZIPInputStream(inputStream);
+            }
+            else {
+                parserStream = inputStream;
+            }
+
+            Document parsedSitemap = Jsoup.parse(parserStream, "UTF-8", sitemapUrl.toString(), Parser.xmlParser());
+            if (parsedSitemap.childrenSize() == 0) {
+                return new SitemapResult.SitemapError();
+            }
+
+            String rootTagName = parsedSitemap.child(0).tagName();
+
+            return switch (rootTagName.toLowerCase()) {
+                case "sitemapindex" -> {
+                    List<String> references = new ArrayList<>();
+                    for (var locTag : parsedSitemap.getElementsByTag("loc")) {
+                        references.add(URLDecoder.decode(locTag.text().trim(), StandardCharsets.UTF_8));
+                    }
+                    yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
+                }
+                case "urlset" -> {
+                    List<String> urls = new ArrayList<>();
+                    for (var locTag : parsedSitemap.select("url > loc")) {
+                        urls.add(URLDecoder.decode(locTag.text().trim(), StandardCharsets.UTF_8));
+                    }
+                    yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
+                }
+                case "rss", "atom" -> {
+                    List<String> urls = new ArrayList<>();
+                    for (var locTag : parsedSitemap.select("link, url")) {
+                        urls.add(locTag.text().trim());
+                    }
+                    yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
+                }
+                default -> new SitemapResult.SitemapError();
+            };
+        }
+    }
+
+    private sealed interface SitemapResult {
+        record SitemapUrls(List<String> urls) implements SitemapResult {}
+        record SitemapReferences(List<String> sitemapRefs) implements SitemapResult {}
+        record SitemapError() implements SitemapResult {}
+    }
+
     @Override
     public SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder) {
         var ret = fetchAndParseRobotsTxt(new EdgeUrl("https", domain, null, "/robots.txt", null), recorder);

@@ -257,14 +383,15 @@

     private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
         try {
-            var getBuilder = new Request.Builder().get();
-
-            getBuilder.url(url.toString())
-                    .addHeader("Accept-Encoding", "gzip")
-                    .addHeader("Accept", "text/*, */*;q=0.9")
-                    .addHeader("User-agent", userAgentString);
-
-            HttpFetchResult result = recorder.fetch(client, getBuilder.build());
+            var getRequest = HttpRequest.newBuilder()
+                    .GET()
+                    .uri(url.asURI())
+                    .header("Accept-Encoding", "gzip")
+                    .header("Accept", "text/*, */*;q=0.9")
+                    .header("User-agent", userAgentString)
+                    .timeout(requestTimeout);
+
+            HttpFetchResult result = recorder.fetch(client, getRequest.build());

             return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) ->
                     robotsParser.parseContent(url.toString(),
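Aside (not part of the diff): the rewritten probeDomain and content-type probe above read the post-redirect location from HttpResponse.uri(), which only works because the client is built with followRedirects(Redirect.NORMAL). A minimal standalone sketch of that behaviour — the URL is illustrative and .HEAD() requires a recent JDK (18+):

```java
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

class RedirectProbeDemo {
    public static void main(String[] args) throws Exception {
        var client = HttpClient.newBuilder()
                .followRedirects(HttpClient.Redirect.NORMAL)
                .build();

        var head = HttpRequest.newBuilder()
                .HEAD()
                .uri(URI.create("http://example.com/"))   // illustrative URL
                .build();

        var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());

        // If the server redirected (e.g. to https), rsp.uri() reports the final location,
        // not the URI that was originally requested.
        System.out.println(rsp.uri());
    }
}
```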
@@ -1,31 +0,0 @@
-package nu.marginalia.crawl.fetcher.socket;
-
-import okhttp3.Interceptor;
-import okhttp3.Response;
-import org.jetbrains.annotations.NotNull;
-
-import java.io.IOException;
-
-
-/** An interceptor that intercepts network requests and adds the remote IP address as
- * a header in the response. This is used to pass the remote IP address to the Warc
- * writer, as this information is not available in the response.
- */
-public class IpInterceptingNetworkInterceptor implements Interceptor {
-    private static final String pseudoHeaderName = "X-Marginalia-Remote-IP";
-
-    @NotNull
-    @Override
-    public Response intercept(@NotNull Interceptor.Chain chain) throws IOException {
-        String IP = chain.connection().socket().getInetAddress().getHostAddress();
-
-        return chain.proceed(chain.request())
-                .newBuilder()
-                .addHeader(pseudoHeaderName, IP)
-                .build();
-    }
-
-    public static String getIpFromResponse(Response response) {
-        return response.header(pseudoHeaderName);
-    }
-}
@@ -27,7 +27,7 @@ public class NoSecuritySSL {
         }
     };

-    public static SSLSocketFactory buildSocketFactory() {
+    public static SSLContext buildSslContext() {
         try {
             // Install the all-trusting trust manager
             final SSLContext sslContext = SSLContext.getInstance("TLS");

@@ -40,14 +40,11 @@
         clientSessionContext.setSessionCacheSize(2048);

         // Create a ssl socket factory with our all-trusting manager
-        return sslContext.getSocketFactory();
+        return sslContext;
         }
         catch (Exception e) {
             throw new RuntimeException(e);
         }
     }

-    public static HostnameVerifier buildHostnameVerifyer() {
-        return (hn, session) -> true;
-    }
 }
@@ -1,14 +1,14 @@
 package nu.marginalia.crawl.fetcher.warc;

-import okhttp3.Headers;
-import okhttp3.Response;
 import org.apache.commons.io.input.BOMInputStream;
 import org.netpreserve.jwarc.WarcTruncationReason;

 import java.io.*;
+import java.net.http.HttpHeaders;
+import java.net.http.HttpResponse;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.Objects;
+import java.util.Map;
 import java.util.zip.GZIPInputStream;

 /** Input buffer for temporary storage of a HTTP response

@@ -17,8 +17,9 @@
 * */
 public abstract class WarcInputBuffer implements AutoCloseable {
     protected WarcTruncationReason truncationReason = WarcTruncationReason.NOT_TRUNCATED;
-    protected Headers headers;
-    WarcInputBuffer(Headers headers) {
+    protected HttpHeaders headers;
+
+    WarcInputBuffer(HttpHeaders headers) {
         this.headers = headers;
     }

@@ -30,7 +31,7 @@
     public final WarcTruncationReason truncationReason() { return truncationReason; }

-    public final Headers headers() { return headers; }
+    public final HttpHeaders headers() { return headers; }

     /** Create a buffer for a response.
      * If the response is small and not compressed, it will be stored in memory.

@@ -38,26 +39,27 @@
      * and suppressed from the headers.
      * If an error occurs, a buffer will be created with no content and an error status.
      */
-    static WarcInputBuffer forResponse(Response rsp) {
+    static WarcInputBuffer forResponse(HttpResponse<InputStream> rsp) {
         if (rsp == null)
             return new ErrorBuffer();

-        try {
-            String contentLengthHeader = Objects.requireNonNullElse(rsp.header("Content-Length"), "-1");
-            int contentLength = Integer.parseInt(contentLengthHeader);
-            String contentEncoding = rsp.header("Content-Encoding");
+        var headers = rsp.headers();
+
+        try (var is = rsp.body()) {
+            int contentLength = (int) headers.firstValueAsLong("Content-Length").orElse(-1L);
+            String contentEncoding = headers.firstValue("Content-Encoding").orElse(null);

             if (contentEncoding == null && contentLength > 0 && contentLength < 8192) {
                 // If the content is small and not compressed, we can just read it into memory
-                return new MemoryBuffer(rsp, contentLength);
+                return new MemoryBuffer(headers, is, contentLength);
             }
             else {
                 // Otherwise, we unpack it into a file and read it from there
-                return new FileBuffer(rsp);
+                return new FileBuffer(headers, is);
             }
         }
         catch (Exception ex) {
-            return new ErrorBuffer(rsp);
+            return new ErrorBuffer();
         }
     }

@@ -99,12 +101,8 @@
 /** Pseudo-buffer for when we have an error */
 class ErrorBuffer extends WarcInputBuffer {
     public ErrorBuffer() {
-        super(Headers.of());
-        truncationReason = WarcTruncationReason.UNSPECIFIED;
-    }
-
-    public ErrorBuffer(Response rsp) {
-        super(rsp.headers());
+        super(HttpHeaders.of(Map.of(), (k,v)->false));
         truncationReason = WarcTruncationReason.UNSPECIFIED;
     }

@@ -125,12 +123,12 @@
 /** Buffer for when we have the response in memory */
 class MemoryBuffer extends WarcInputBuffer {
     byte[] data;
-    public MemoryBuffer(Response response, int size) {
-        super(response.headers());
+    public MemoryBuffer(HttpHeaders headers, InputStream responseStream, int size) {
+        super(headers);

         var outputStream = new ByteArrayOutputStream(size);

-        copy(response.body().byteStream(), outputStream);
+        copy(responseStream, outputStream);

         data = outputStream.toByteArray();
     }

@@ -154,19 +152,15 @@
 class FileBuffer extends WarcInputBuffer {
     private final Path tempFile;

-    public FileBuffer(Response response) throws IOException {
-        super(suppressContentEncoding(response.headers()));
+    public FileBuffer(HttpHeaders headers, InputStream responseStream) throws IOException {
+        super(suppressContentEncoding(headers));

         this.tempFile = Files.createTempFile("rsp", ".html");

-        if (response.body() == null) {
-            truncationReason = WarcTruncationReason.DISCONNECT;
-            return;
-        }
-
-        if ("gzip".equals(response.header("Content-Encoding"))) {
+        if ("gzip".equalsIgnoreCase(headers.firstValue("Content-Encoding").orElse(""))) {
             try (var out = Files.newOutputStream(tempFile)) {
-                copy(new GZIPInputStream(response.body().byteStream()), out);
+                copy(new GZIPInputStream(responseStream), out);
             }
             catch (Exception ex) {
                 truncationReason = WarcTruncationReason.UNSPECIFIED;

@@ -174,7 +168,7 @@
         }
         else {
             try (var out = Files.newOutputStream(tempFile)) {
-                copy(response.body().byteStream(), out);
+                copy(responseStream, out);
             }
             catch (Exception ex) {
                 truncationReason = WarcTruncationReason.UNSPECIFIED;

@@ -182,22 +176,13 @@
         }
     }

-    private static Headers suppressContentEncoding(Headers headers) {
-        var builder = new Headers.Builder();
-
-        headers.toMultimap().forEach((k, values) -> {
-            if ("Content-Encoding".equalsIgnoreCase(k)) {
-                return;
-            }
-            if ("Transfer-Encoding".equalsIgnoreCase(k)) {
-                return;
-            }
-            for (var value : values) {
-                builder.add(k, value);
-            }
-        });
-
-        return builder.build();
+    private static HttpHeaders suppressContentEncoding(HttpHeaders headers) {
+        return HttpHeaders.of(headers.map(), (k, v) -> {
+            if ("Content-Encoding".equalsIgnoreCase(k)) {
+                return false;
+            }
+            return !"Transfer-Encoding".equalsIgnoreCase(k);
+        });
     }
@@ -1,11 +1,12 @@
 package nu.marginalia.crawl.fetcher.warc;
 
-import okhttp3.Protocol;
-import okhttp3.Response;
 import org.apache.commons.lang3.StringUtils;
 
 import java.net.URI;
 import java.net.URLEncoder;
+import java.net.http.HttpClient;
+import java.net.http.HttpHeaders;
+import java.net.http.HttpResponse;
 import java.nio.charset.StandardCharsets;
 import java.util.*;
 import java.util.stream.Collectors;
@@ -75,13 +76,13 @@ public class WarcProtocolReconstructor {
         return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
     }
 
-    static String getResponseHeader(Response response, long size) {
-        String version = response.protocol() == Protocol.HTTP_1_1 ? "1.1" : "2.0";
+    static String getResponseHeader(HttpResponse<?> response, long size) {
+        String version = response.version() == HttpClient.Version.HTTP_1_1 ? "1.1" : "2.0";
 
-        String statusCode = String.valueOf(response.code());
-        String statusMessage = STATUS_CODE_MAP.getOrDefault(response.code(), "Unknown");
+        String statusCode = String.valueOf(response.statusCode());
+        String statusMessage = STATUS_CODE_MAP.getOrDefault(response.statusCode(), "Unknown");
 
-        String headerString = getHeadersAsString(response, size);
+        String headerString = getHeadersAsString(response.headers(), size);
 
         return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
     }
@@ -148,10 +149,10 @@ public class WarcProtocolReconstructor {
         return joiner.toString();
     }
 
-    static private String getHeadersAsString(Response response, long responseSize) {
+    static private String getHeadersAsString(HttpHeaders headers, long responseSize) {
         StringJoiner joiner = new StringJoiner("\r\n");
 
-        response.headers().toMultimap().forEach((k, values) -> {
+        headers.map().forEach((k, values) -> {
             String headerCapitalized = capitalizeHeader(k);
 
             // Omit pseudoheaders injected by the crawler itself
@@ -179,8 +180,8 @@ public class WarcProtocolReconstructor {
         return joiner.toString();
     }
 
-    // okhttp gives us flattened headers, so we need to reconstruct Camel-Kebab-Case style
-    // for the WARC parser's sake...
+    // okhttp gave us flattened headers, so we need to reconstruct Camel-Kebab-Case style
+    // for the WARC parser's sake... (do we still need this, mr chesterton?)
     static private String capitalizeHeader(String k) {
         return Arrays.stream(StringUtils.split(k, '-'))
                 .map(StringUtils::capitalize)
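
For reference, a minimal, self-contained sketch (not part of the change set) of the java.net.http.HttpHeaders filter-predicate pattern that replaces okhttp's Headers in the hunks above; the class name and the sample header values here are illustrative only:

```java
import java.net.http.HttpHeaders;
import java.util.List;
import java.util.Map;

class HeaderFilterSketch {
    // Same idea as suppressContentEncoding above: copy every header except
    // Content-Encoding and Transfer-Encoding by rejecting them in the predicate.
    static HttpHeaders suppressContentEncoding(HttpHeaders headers) {
        return HttpHeaders.of(headers.map(), (k, v) -> {
            if ("Content-Encoding".equalsIgnoreCase(k)) {
                return false;
            }
            return !"Transfer-Encoding".equalsIgnoreCase(k);
        });
    }

    public static void main(String[] args) {
        HttpHeaders original = HttpHeaders.of(
                Map.of("Content-Type", List.of("text/html"),
                       "Content-Encoding", List.of("gzip")),
                (k, v) -> true);

        // Prints only the Content-Type entry; the encoding header is dropped
        System.out.println(suppressContentEncoding(original).map());
    }
}
```
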
@@ -1,13 +1,11 @@
 package nu.marginalia.crawl.fetcher.warc;
 
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.Cookies;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
-import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
-import okhttp3.OkHttpClient;
-import okhttp3.Request;
 import org.jetbrains.annotations.Nullable;
 import org.netpreserve.jwarc.*;
 import org.slf4j.Logger;
@@ -18,16 +16,18 @@ import java.io.InputStream;
 import java.net.InetAddress;
 import java.net.URI;
 import java.net.URISyntaxException;
+import java.net.http.HttpClient;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.security.NoSuchAlgorithmException;
+import java.time.Duration;
 import java.time.Instant;
 import java.util.*;
 
 /** Based on JWarc's fetch method, APL 2.0 license
  * <p></p>
- * This class wraps OkHttp's OkHttpClient and records the HTTP request and response in a WARC file,
+ * This class wraps HttpClient and records the HTTP request and response in a WARC file,
  * as best is possible given not all the data is available at the same time and needs to
  * be reconstructed.
  */
@@ -47,20 +47,22 @@ public class WarcRecorder implements AutoCloseable {
     // Affix a version string in case we need to change the format in the future
     // in some way
     private final String warcRecorderVersion = "1.0";
-    // We need to know if the site uses cookies so this can be reported among the search results
-    // -- flip this to true if we see any cookies. This information will also be painted on any
-    // revisited pages. It's not 100% perfect and a bit order dependent, but it's good enough.
-    private final WarcXCookieInformationHeader cookieInformation = new WarcXCookieInformationHeader();
+    private final Cookies cookies;
 
     /**
      * Create a new WarcRecorder that will write to the given file
      *
      * @param warcFile The file to write to
      */
-    public WarcRecorder(Path warcFile) throws IOException {
+    public WarcRecorder(Path warcFile, HttpFetcherImpl fetcher) throws IOException {
         this.warcFile = warcFile;
         this.writer = new WarcWriter(warcFile);
+        this.cookies = fetcher.getCookies();
+    }
+
+    public WarcRecorder(Path warcFile, Cookies cookies) throws IOException {
+        this.warcFile = warcFile;
+        this.writer = new WarcWriter(warcFile);
+        this.cookies = cookies;
     }
 
     /**
@@ -70,37 +72,41 @@ public class WarcRecorder implements AutoCloseable {
     public WarcRecorder() throws IOException {
         this.warcFile = Files.createTempFile("warc", ".warc.gz");
         this.writer = new WarcWriter(this.warcFile);
+        this.cookies = new Cookies();
 
         temporaryFile = true;
     }
 
-    public HttpFetchResult fetch(OkHttpClient client, Request request) throws NoSuchAlgorithmException,
-            IOException,
-            URISyntaxException,
-            InterruptedException
+    public HttpFetchResult fetch(HttpClient client,
+                                 java.net.http.HttpRequest request)
+            throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
     {
-        URI requestUri = request.url().uri();
+        URI requestUri = request.uri();
 
         WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
         WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
 
-        String ip;
         Instant date = Instant.now();
 
-        var call = client.newCall(request);
-
-        cookieInformation.update(client, request.url());
+        var response = client.send(request, java.net.http.HttpResponse.BodyHandlers.ofInputStream());
 
-        try (var response = call.execute();
-             WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response))
+        Map<String, List<String>> extraHeaders = new HashMap<>();
+
+        // Not entirely sure why we need to do this, but keeping it due to Chesterton's Fence
+        extraHeaders.putAll(request.headers().map());
+
+        try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response))
         {
+            if (cookies.hasCookies()) {
+                extraHeaders.put("X-Has-Cookies", List.of("1"));
+            }
+
             byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
 
             ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);
             InputStream inputStream = inputBuffer.read();
 
-            ip = IpInterceptingNetworkInterceptor.getIpFromResponse(response);
-
             responseDataBuffer.put(responseHeaders);
             responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseHeaders.length);
 
@@ -123,17 +129,15 @@ public class WarcRecorder implements AutoCloseable {
 
             // It looks like this might be the same as requestUri, but it's not;
             // it's the URI after resolving redirects.
-            final URI responseUri = response.request().url().uri();
+            final URI responseUri = response.uri();
 
             WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
                     .blockDigest(responseDigestBuilder.build())
                     .date(date)
                     .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
 
-            cookieInformation.paint(responseBuilder);
-
-            if (ip != null) responseBuilder.ipAddress(InetAddress.getByName(ip));
+            InetAddress inetAddress = InetAddress.getByName(responseUri.getHost());
+            responseBuilder.ipAddress(inetAddress);
 
             responseBuilder.payloadDigest(payloadDigestBuilder.build());
             responseBuilder.truncated(inputBuffer.truncationReason());
 
@@ -150,8 +154,8 @@ public class WarcRecorder implements AutoCloseable {
             byte[] httpRequestString = WarcProtocolReconstructor
                     .getHttpRequestString(
                             response.request().method(),
-                            response.request().headers().toMultimap(),
-                            request.headers().toMultimap(),
+                            response.request().headers().map(),
+                            extraHeaders,
                             requestUri)
                     .getBytes();
 
@@ -167,10 +171,29 @@ public class WarcRecorder implements AutoCloseable {
             warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it
             writer.write(warcRequest);
 
+            if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
+                    && inputBuffer.size() < 2048
+                    && !request.uri().getPath().endsWith("robots.txt")) // don't bail on robots.txt
+            {
+                // Fast detection and mitigation of crawler traps that respond with slow
+                // small responses, with a high branching factor
+
+                // Note we bail *after* writing the warc records, this will effectively only
+                // prevent link extraction from the document.
+
+                logger.warn("URL {} took too long to fetch ({}s) and was too small for the effort ({}b)",
+                        requestUri,
+                        Duration.between(date, Instant.now()).getSeconds(),
+                        inputBuffer.size()
+                );
+
+                return new HttpFetchResult.ResultException(new IOException("Likely crawler trap"));
+            }
+
             return new HttpFetchResult.ResultOk(responseUri,
-                    response.code(),
+                    response.statusCode(),
                     inputBuffer.headers(),
-                    ip,
+                    inetAddress.getHostAddress(),
                     responseDataBuffer.data,
                     dataStart,
                     responseDataBuffer.length() - dataStart);
@@ -246,7 +269,9 @@ public class WarcRecorder implements AutoCloseable {
                 .date(Instant.now())
                 .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
 
-        cookieInformation.paint(builder);
+        if (cookies.hasCookies()) {
+            builder.addHeader("X-Has-Cookies", "1");
+        }
 
         var reference = builder.build();
 
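
Putting the WarcRecorder hunks above together, the new call shape looks roughly like the following sketch, assembled from the test code later in this change set. It is not part of the change set itself; the warc file name is made up, and it assumes the crawling-process module (WarcRecorder, Cookies) is on the classpath.

```java
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.nio.file.Path;

class WarcRecorderFetchSketch {
    public static void main(String[] args) throws Exception {
        // A plain JDK HttpClient replaces the OkHttpClient used before
        HttpClient httpClient = HttpClient.newBuilder().build();

        try (var recorder = new WarcRecorder(Path.of("demo.warc.gz"), new Cookies())) {
            var request = HttpRequest.newBuilder()
                    .uri(new URI("https://www.marginalia.nu/"))
                    .header("User-agent", "test.marginalia.nu")
                    .header("Accept-Encoding", "gzip")
                    .GET().build();

            // Records the request/response pair in the WARC file and returns an HttpFetchResult
            var result = recorder.fetch(httpClient, request);
            System.out.println(result);
        }
    }
}
```
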
@@ -12,7 +12,6 @@ import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.logic.LinkFilterSelector;
 import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
 import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
-import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
 import nu.marginalia.ip_blocklist.UrlBlocklist;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
@@ -53,7 +52,6 @@ public class CrawlerRetreiver implements AutoCloseable {
     private final WarcRecorder warcRecorder;
     private final CrawlerRevisitor crawlerRevisitor;
 
-    private final SitemapFetcher sitemapFetcher;
     int errorCount = 0;
 
     public CrawlerRetreiver(HttpFetcher fetcher,
@@ -71,7 +69,6 @@ public class CrawlerRetreiver implements AutoCloseable {
 
         crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), specs.urls(), specs.crawlDepth());
         crawlerRevisitor = new CrawlerRevisitor(crawlFrontier, this, warcRecorder);
-        sitemapFetcher = new SitemapFetcher(crawlFrontier, fetcher.createSitemapRetriever());
 
         // We must always crawl the index page first, this is assumed when fingerprinting the server
         var fst = crawlFrontier.peek();
@@ -145,9 +142,11 @@ public class CrawlerRetreiver implements AutoCloseable {
         // Add external links to the crawl frontier
         crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));
 
-        // Add links from the sitemap to the crawl frontier
-        sitemapFetcher.downloadSitemaps(robotsRules, rootUrl);
+        // Fetch sitemaps
+        for (var sitemap : robotsRules.getSitemaps()) {
+            crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
+        }
 
         while (!crawlFrontier.isEmpty()
                 && !crawlFrontier.isCrawlDepthReached()
@@ -271,10 +270,7 @@ public class CrawlerRetreiver implements AutoCloseable {
         }
 
         // Download the sitemap if available
-        if (feedLink.isPresent()) {
-            sitemapFetcher.downloadSitemaps(List.of(feedLink.get()));
-            timer.waitFetchDelay(0);
-        }
+        feedLink.ifPresent(s -> fetcher.fetchSitemapUrls(s, timer));
 
         // Grab the favicon if it exists
         fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
@@ -1,72 +0,0 @@
-package nu.marginalia.crawl.retreival.sitemap;
-
-import crawlercommons.robots.SimpleRobotRules;
-import nu.marginalia.crawl.fetcher.SitemapRetriever;
-import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
-import nu.marginalia.model.EdgeUrl;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.HashSet;
-import java.util.List;
-import java.util.Optional;
-import java.util.Set;
-
-public class SitemapFetcher {
-
-    private final DomainCrawlFrontier crawlFrontier;
-    private final SitemapRetriever sitemapRetriever;
-    private static final Logger logger = LoggerFactory.getLogger(SitemapFetcher.class);
-
-    public SitemapFetcher(DomainCrawlFrontier crawlFrontier, SitemapRetriever sitemapRetriever) {
-        this.crawlFrontier = crawlFrontier;
-        this.sitemapRetriever = sitemapRetriever;
-    }
-
-    public void downloadSitemaps(SimpleRobotRules robotsRules, EdgeUrl rootUrl) {
-        List<String> urls = robotsRules.getSitemaps();
-
-        if (urls.isEmpty()) {
-            urls = List.of(rootUrl.withPathAndParam("/sitemap.xml", null).toString());
-        }
-
-        downloadSitemaps(urls);
-    }
-
-    public void downloadSitemaps(List<String> urls) {
-
-        Set<String> checkedSitemaps = new HashSet<>();
-
-        for (var rawUrl : urls) {
-            Optional<EdgeUrl> parsedUrl = EdgeUrl.parse(rawUrl);
-            if (parsedUrl.isEmpty()) {
-                continue;
-            }
-
-            EdgeUrl url = parsedUrl.get();
-
-            // Let's not download sitemaps from other domains for now
-            if (!crawlFrontier.isSameDomain(url)) {
-                continue;
-            }
-
-            if (checkedSitemaps.contains(url.path))
-                continue;
-
-            var sitemap = sitemapRetriever.fetchSitemap(url);
-            if (sitemap.isEmpty()) {
-                continue;
-            }
-
-            // ensure we don't try to download this sitemap again
-            // (don't move this up, as we may want to check the same
-            // path with different protocols until we find one that works)
-
-            checkedSitemaps.add(url.path);
-
-            crawlFrontier.addAllToQueue(sitemap);
-        }
-
-        logger.debug("Queue is now {}", crawlFrontier.queueSize());
-    }
-}
@@ -36,7 +36,6 @@ dependencies {
     implementation libs.gson
     implementation libs.commons.io
     implementation libs.commons.lang3
-    implementation libs.okhttp3
     implementation libs.jsoup
     implementation libs.snakeyaml
     implementation libs.zstd
@@ -1,17 +1,17 @@
 package nu.marginalia.model.body;
 
 import nu.marginalia.contenttype.ContentType;
-import okhttp3.Headers;
+import org.jetbrains.annotations.Nullable;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.netpreserve.jwarc.MessageHeaders;
 import org.netpreserve.jwarc.WarcResponse;
 
 import java.io.ByteArrayInputStream;
-import java.io.IOException;
 import java.io.InputStream;
 import java.net.InetAddress;
 import java.net.URI;
+import java.net.http.HttpHeaders;
 import java.util.Optional;
 
 /* FIXME: This interface has a very unfortunate name that is not very descriptive.
@@ -56,42 +56,26 @@ public sealed interface HttpFetchResult {
      */
     record ResultOk(URI uri,
                     int statusCode,
-                    Headers headers,
+                    HttpHeaders headers,
                     String ipAddress,
                     byte[] bytesRaw,
                     int bytesStart,
                     int bytesLength
     ) implements HttpFetchResult {
 
+        public ResultOk(URI uri, int status, MessageHeaders headers, String ipAddress, byte[] bytes, int bytesStart, int length) {
+            this(uri, status, HttpHeaders.of(headers.map(), (k,v) -> true), ipAddress, bytes, bytesStart, length);
+        }
+
         public boolean isOk() {
             return statusCode >= 200 && statusCode < 300;
         }
 
-        public ResultOk(URI uri,
-                        int statusCode,
-                        MessageHeaders headers,
-                        String ipAddress,
-                        byte[] bytesRaw,
-                        int bytesStart,
-                        int bytesLength) {
-            this(uri, statusCode, convertHeaders(headers), ipAddress, bytesRaw, bytesStart, bytesLength);
-        }
-
-        private static Headers convertHeaders(MessageHeaders headers) {
-            var ret = new Headers.Builder();
-            for (var header : headers.map().entrySet()) {
-                for (var value : header.getValue()) {
-                    ret.add(header.getKey(), value);
-                }
-            }
-            return ret.build();
-        }
-
         public InputStream getInputStream() {
             return new ByteArrayInputStream(bytesRaw, bytesStart, bytesLength);
         }
 
-        public Optional<Document> parseDocument() throws IOException {
+        public Optional<Document> parseDocument() {
             return DocumentBodyExtractor.asString(this).flatMapOpt((contentType, body) -> {
                 if (contentType.is("text/html")) {
                     return Optional.of(Jsoup.parse(body));
@@ -102,8 +86,9 @@ public sealed interface HttpFetchResult {
             });
         }
 
+        @Nullable
         public String header(String name) {
-            return headers.get(name);
+            return headers.firstValue(name).orElse(null);
         }
 
     }
@@ -165,27 +165,28 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
             contentType = "";
         }
 
-        String headersStr = null;
         StringJoiner headersStrBuilder = new StringJoiner("\n");
-        for (var header : headers) {
-            headersStrBuilder.add(header.getFirst() + ": " + header.getSecond());
+        for (var header : headers.map().entrySet()) {
+            for (var value : header.getValue()) {
+                headersStrBuilder.add(header.getKey() + ": " + value);
+            }
         }
-        headersStr = headersStrBuilder.toString();
+        String headersStr = headersStrBuilder.toString();
 
 
         write(new CrawledDocumentParquetRecord(
                 domain,
                 response.target(),
                 fetchOk.ipAddress(),
-                WarcXCookieInformationHeader.hasCookies(response),
+                headers.firstValue("X-Has-Cookies").orElse("0").equals("1"),
                 fetchOk.statusCode(),
                 response.date(),
                 contentType,
                 bodyBytes,
                 headersStr,
-                headers.get("ETag"),
-                headers.get("Last-Modified"))
-        );
+                headers.firstValue("ETag").orElse(null),
+                headers.firstValue("Last-Modified").orElse(null)
+        ));
     }
 
 
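
The parquet-writer hunk above flattens the new HttpHeaders into one "Name: value" line per value and reads single headers through firstValue(...).orElse(...). A small self-contained sketch of that pattern, not part of the change set and with made-up header values:

```java
import java.net.http.HttpHeaders;
import java.util.List;
import java.util.Map;
import java.util.StringJoiner;

class HeaderFlattenSketch {
    public static void main(String[] args) {
        HttpHeaders headers = HttpHeaders.of(
                Map.of("Content-Type", List.of("text/html"),
                       "Set-Cookie", List.of("a=1", "b=2")),
                (k, v) -> true);

        // One line per header value; multi-valued headers are repeated
        StringJoiner joiner = new StringJoiner("\n");
        for (var header : headers.map().entrySet()) {
            for (var value : header.getValue()) {
                joiner.add(header.getKey() + ": " + value);
            }
        }
        System.out.println(joiner);

        // Missing headers fall back to a default instead of throwing
        System.out.println(headers.firstValue("ETag").orElse(null));
    }
}
```
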
@@ -1,35 +0,0 @@
-package org.netpreserve.jwarc;
-
-import okhttp3.HttpUrl;
-import okhttp3.OkHttpClient;
-
-/** Encapsulates out-of-band information about whether a website uses cookies,
- * using a non-standard WARC header "X-Has-Cookies".
- */
-public class WarcXCookieInformationHeader {
-    private boolean hasCookies = false;
-    private static final String headerName = "X-Has-Cookies";
-
-    public void update(OkHttpClient client, HttpUrl url) {
-        if (!hasCookies) {
-            hasCookies = !client.cookieJar().loadForRequest(url).isEmpty();
-        }
-    }
-
-    public boolean hasCookies() {
-        return hasCookies;
-    }
-
-    public void paint(WarcResponse.Builder builder) {
-        builder.addHeader(headerName, hasCookies ? "1" : "0");
-    }
-    public void paint(WarcXResponseReference.Builder builder) {
-        builder.addHeader(headerName, hasCookies ? "1" : "0");
-    }
-
-    public static boolean hasCookies(WarcRecord record) {
-        return record.headers().contains(headerName, "1");
-    }
-
-
-}
@@ -1,11 +1,9 @@
 package nu.marginalia.crawl.retreival;
 
-import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
+import nu.marginalia.crawl.fetcher.Cookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
-import okhttp3.OkHttpClient;
-import okhttp3.Request;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -15,6 +13,8 @@ import org.netpreserve.jwarc.WarcResponse;
 
 import java.io.IOException;
 import java.net.URISyntaxException;
+import java.net.http.HttpClient;
+import java.net.http.HttpRequest;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.security.NoSuchAlgorithmException;
@@ -27,11 +27,10 @@ import static org.junit.jupiter.api.Assertions.fail;
 class CrawlerWarcResynchronizerTest {
     Path fileName;
     Path outputFile;
-    OkHttpClient httpClient;
+    HttpClient httpClient;
     @BeforeEach
     public void setUp() throws Exception {
-        httpClient = new OkHttpClient.Builder()
-                .addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
+        httpClient = HttpClient.newBuilder()
                 .build();
 
         fileName = Files.createTempFile("test", ".warc.gz");
@@ -46,7 +45,7 @@ class CrawlerWarcResynchronizerTest {
 
     @Test
     void run() throws IOException, URISyntaxException {
-        try (var oldRecorder = new WarcRecorder(fileName)) {
+        try (var oldRecorder = new WarcRecorder(fileName, new Cookies())) {
             fetchUrl(oldRecorder, "https://www.marginalia.nu/");
             fetchUrl(oldRecorder, "https://www.marginalia.nu/log/");
             fetchUrl(oldRecorder, "https://www.marginalia.nu/feed/");
@@ -56,7 +55,7 @@ class CrawlerWarcResynchronizerTest {
 
         var crawlFrontier = new DomainCrawlFrontier(new EdgeDomain("www.marginalia.nu"), List.of(), 100);
 
-        try (var newRecorder = new WarcRecorder(outputFile)) {
+        try (var newRecorder = new WarcRecorder(outputFile, new Cookies())) {
             new CrawlerWarcResynchronizer(crawlFrontier, newRecorder).run(fileName);
         }
 
@@ -79,10 +78,11 @@ class CrawlerWarcResynchronizerTest {
     }
 
     void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        var req = new Request.Builder().url(url)
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .get().build();
+        var req = HttpRequest.newBuilder()
+                .uri(new java.net.URI(url))
+                .header("User-agent", "test.marginalia.nu")
+                .header("Accept-Encoding", "gzip")
+                .GET().build();
         recorder.fetch(httpClient, req);
     }
 }
@@ -2,6 +2,7 @@ package nu.marginalia.crawl.retreival.fetcher;
 
 import com.sun.net.httpserver.HttpServer;
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.Cookies;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@@ -79,7 +80,7 @@ class ContentTypeProberTest {
         htmlRedirEndpoint = EdgeUrl.parse("http://localhost:" + port + "/redir.gz").get();
 
         fetcher = new HttpFetcherImpl("test");
-        recorder = new WarcRecorder(warcFile);
+        recorder = new WarcRecorder(warcFile, new Cookies());
     }
 
     @AfterEach
@@ -2,13 +2,11 @@ package nu.marginalia.crawl.retreival.fetcher;
 
 import nu.marginalia.UserAgent;
 import nu.marginalia.crawl.fetcher.ContentTags;
-import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
+import nu.marginalia.crawl.fetcher.Cookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
-import okhttp3.OkHttpClient;
-import okhttp3.Request;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -19,6 +17,8 @@ import org.netpreserve.jwarc.WarcXResponseReference;
 
 import java.io.IOException;
 import java.net.URISyntaxException;
+import java.net.http.HttpClient;
+import java.net.http.HttpRequest;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.security.NoSuchAlgorithmException;
@@ -31,17 +31,16 @@ class WarcRecorderTest {
     Path fileNameWarc;
     Path fileNameParquet;
     WarcRecorder client;
-    OkHttpClient httpClient;
+    HttpClient httpClient;
     @BeforeEach
     public void setUp() throws Exception {
-        httpClient = new OkHttpClient.Builder()
-                .addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
-                .build();
+        httpClient = HttpClient.newBuilder().build();
 
         fileNameWarc = Files.createTempFile("test", ".warc");
         fileNameParquet = Files.createTempFile("test", ".parquet");
 
-        client = new WarcRecorder(fileNameWarc);
+        client = new WarcRecorder(fileNameWarc, new Cookies());
     }
 
     @AfterEach
@@ -52,10 +51,13 @@ class WarcRecorderTest {
 
     @Test
    void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/")
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .get().build());
+        client.fetch(httpClient,
+                HttpRequest.newBuilder()
+                        .uri(new java.net.URI("https://www.marginalia.nu/"))
+                        .header("User-agent", "test.marginalia.nu")
+                        .header("Accept-Encoding", "gzip")
+                        .GET().build()
+        );
 
         Map<String, String> sampleData = new HashMap<>();
         try (var warcReader = new WarcReader(fileNameWarc)) {
@@ -76,7 +78,7 @@ class WarcRecorderTest {
     @Test
     public void flagAsSkipped() throws IOException, URISyntaxException {
 
-        try (var recorder = new WarcRecorder(fileNameWarc)) {
+        try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
             recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
                     "text/html",
                     200,
@@ -100,7 +102,7 @@ class WarcRecorderTest {
     @Test
     public void flagAsSkippedNullBody() throws IOException, URISyntaxException {
 
-        try (var recorder = new WarcRecorder(fileNameWarc)) {
+        try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
             recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
                     "text/html",
                     200,
@@ -112,7 +114,7 @@ class WarcRecorderTest {
 
     @Test
     public void testSaveImport() throws URISyntaxException, IOException {
-        try (var recorder = new WarcRecorder(fileNameWarc)) {
+        try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
             recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
                     "text/html",
                     200,
@@ -136,19 +138,23 @@ class WarcRecorderTest {
 
     @Test
     public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/")
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .get().build());
-        client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/log/")
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .get().build());
-        client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/sanic.png")
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .get().build());
-        client.close();
+        client.fetch(httpClient, HttpRequest.newBuilder()
+                .uri(new java.net.URI("https://www.marginalia.nu/"))
+                .header("User-agent", "test.marginalia.nu")
+                .header("Accept-Encoding", "gzip")
+                .GET().build());
+
+        client.fetch(httpClient, HttpRequest.newBuilder()
+                .uri(new java.net.URI("https://www.marginalia.nu/log/"))
+                .header("User-agent", "test.marginalia.nu")
+                .header("Accept-Encoding", "gzip")
+                .GET().build());
+
+        client.fetch(httpClient, HttpRequest.newBuilder()
+                .uri(new java.net.URI("https://www.marginalia.nu/sanic.png"))
+                .header("User-agent", "test.marginalia.nu")
+                .header("Accept-Encoding", "gzip")
+                .GET().build());
 
         CrawledDocumentParquetRecordFileWriter.convertWarc(
                 "www.marginalia.nu",
@@ -4,6 +4,7 @@ import nu.marginalia.crawl.fetcher.ContentTags;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.retreival.CrawlDelayTimer;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.ContentTypeLogic;
 import nu.marginalia.model.body.DocumentBodyExtractor;
@@ -37,6 +38,12 @@ class HttpFetcherTest {
         }
     }
 
+    @Test
+    void testSitemapMarginalia() {
+        var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
+        fetcher.fetchSitemapUrls("https://www.marginalia.nu/sitemap.xml", new CrawlDelayTimer(1)).forEach(System.out::println);
+    }
+
     @Test
     void fetchText() throws Exception {
         var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
@@ -3,11 +3,9 @@ package nu.marginalia.crawling.retreival;
 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.DomainStateDb;
-import nu.marginalia.crawl.fetcher.ContentTags;
-import nu.marginalia.crawl.fetcher.HttpFetcher;
-import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
-import nu.marginalia.crawl.fetcher.SitemapRetriever;
+import nu.marginalia.crawl.fetcher.*;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.retreival.CrawlDelayTimer;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.model.EdgeDomain;
@@ -17,7 +15,6 @@ import nu.marginalia.model.crawldata.CrawledDocument;
 import nu.marginalia.model.crawldata.CrawlerDocumentStatus;
 import nu.marginalia.model.crawldata.SerializableCrawlData;
 import nu.marginalia.test.CommonTestData;
-import okhttp3.Headers;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -27,6 +24,7 @@ import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.net.URISyntaxException;
+import java.net.http.HttpHeaders;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
@@ -122,7 +120,7 @@ public class CrawlerMockFetcherTest {
         public void setAllowAllContentTypes(boolean allowAllContentTypes) {}
 
         @Override
-        public List<String> getCookies() { return List.of();}
+        public Cookies getCookies() { return new Cookies();}
 
         @Override
         public void clearCookies() {}
@@ -149,7 +147,7 @@ public class CrawlerMockFetcherTest {
             return new HttpFetchResult.ResultOk(
                     url.asURI(),
                     200,
-                    new Headers.Builder().build(),
+                    HttpHeaders.of(Map.of(), (k,v)->true),
                     "127.0.0.1",
                     bodyBytes,
                     0,
@@ -164,6 +162,11 @@ public class CrawlerMockFetcherTest {
             return new HttpFetchResult.ResultNone();
         }
 
+        @Override
+        public List<EdgeUrl> fetchSitemapUrls(String rootSitemapUrl, CrawlDelayTimer delayTimer) {
+            return List.of();
+        }
+
         @Override
         public SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder) {
             return new SimpleRobotRules();
@@ -174,5 +177,9 @@ public class CrawlerMockFetcherTest {
             return Mockito.mock(SitemapRetriever.class);
         }
 
+        @Override
+        public void close() {
+
+        }
     }
 }
|
|||||||
import nu.marginalia.atags.model.DomainLinks;
|
import nu.marginalia.atags.model.DomainLinks;
|
||||||
import nu.marginalia.crawl.CrawlerMain;
|
import nu.marginalia.crawl.CrawlerMain;
|
||||||
import nu.marginalia.crawl.DomainStateDb;
|
import nu.marginalia.crawl.DomainStateDb;
|
||||||
|
import nu.marginalia.crawl.fetcher.Cookies;
|
||||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||||
@@ -180,7 +181,7 @@ class CrawlerRetreiverTest {
                 new EdgeDomain("www.marginalia.nu"),
                 List.of(), 100);
         var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
-                new WarcRecorder(tempFileWarc2)
+                new WarcRecorder(tempFileWarc2, new Cookies())
         );
 
         // truncate the size of the file to simulate a crash
@@ -458,7 +459,7 @@ class CrawlerRetreiverTest {
                 List.of(), 100);
 
         var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
-                new WarcRecorder(tempFileWarc3)
+                new WarcRecorder(tempFileWarc3, new Cookies())
         );
 
         // truncate the size of the file to simulate a crash
@@ -509,7 +510,7 @@ class CrawlerRetreiverTest {
     }
 
     private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
-        try (var recorder = new WarcRecorder(tempFileWarc2);
+        try (var recorder = new WarcRecorder(tempFileWarc2, new Cookies());
             var db = new DomainStateDb(tempFileDb)
         ) {
             new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(),
@@ -522,7 +523,7 @@ class CrawlerRetreiverTest {
 
     @NotNull
     private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlerMain.CrawlSpecRecord specs) {
-        try (var recorder = new WarcRecorder(tempFileWarc1);
+        try (var recorder = new WarcRecorder(tempFileWarc1, new Cookies());
             var db = new DomainStateDb(tempFileDb)
         ) {
             var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder);
@@ -56,7 +56,6 @@ dependencies {
     implementation libs.zstd
     implementation libs.jwarc
     implementation libs.crawlercommons
-    implementation libs.okhttp3
     implementation libs.jsoup
     implementation libs.opencsv
     implementation libs.fastutil
@@ -84,18 +84,33 @@ public record SearchParameters(WebsiteUrl url,
     }
 
     public String renderUrl() {
-        String path = String.format("/search?query=%s&profile=%s&js=%s&adtech=%s&recent=%s&searchTitle=%s&newfilter=%s&page=%d",
-                URLEncoder.encode(query, StandardCharsets.UTF_8),
-                URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8),
-                URLEncoder.encode(js.value, StandardCharsets.UTF_8),
-                URLEncoder.encode(adtech.value, StandardCharsets.UTF_8),
-                URLEncoder.encode(recent.value, StandardCharsets.UTF_8),
-                URLEncoder.encode(searchTitle.value, StandardCharsets.UTF_8),
-                Boolean.valueOf(newFilter).toString(),
-                page
-        );
-
-        return path;
+        StringBuilder pathBuilder = new StringBuilder("/search?");
+        pathBuilder.append("query=").append(URLEncoder.encode(query, StandardCharsets.UTF_8));
+
+        if (profile != SearchProfile.NO_FILTER) {
+            pathBuilder.append("&profile=").append(URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8));
+        }
+        if (js != SearchJsParameter.DEFAULT) {
+            pathBuilder.append("&js=").append(URLEncoder.encode(js.value, StandardCharsets.UTF_8));
+        }
+        if (adtech != SearchAdtechParameter.DEFAULT) {
+            pathBuilder.append("&adtech=").append(URLEncoder.encode(adtech.value, StandardCharsets.UTF_8));
+        }
+        if (recent != SearchRecentParameter.DEFAULT) {
+            pathBuilder.append("&recent=").append(URLEncoder.encode(recent.value, StandardCharsets.UTF_8));
+        }
+        if (searchTitle != SearchTitleParameter.DEFAULT) {
+            pathBuilder.append("&searchTitle=").append(URLEncoder.encode(searchTitle.value, StandardCharsets.UTF_8));
+        }
+        if (page != 1) {
+            pathBuilder.append("&page=").append(page);
+        }
+        if (newFilter) {
+            pathBuilder.append("&newfilter=").append(Boolean.valueOf(newFilter).toString());
+        }
+
+        return pathBuilder.toString();
     }
 
     public RpcTemporalBias.Bias temporalBias() {
@@ -3,27 +3,22 @@ package nu.marginalia.search.command.commands;
 import com.google.inject.Inject;
 import io.jooby.MapModelAndView;
 import io.jooby.ModelAndView;
-import nu.marginalia.search.JteRenderer;
 import nu.marginalia.search.SearchOperator;
 import nu.marginalia.search.command.SearchCommandInterface;
 import nu.marginalia.search.command.SearchParameters;
 import nu.marginalia.search.model.DecoratedSearchResults;
 import nu.marginalia.search.model.NavbarModel;
 
-import java.io.IOException;
 import java.util.Map;
 import java.util.Optional;
 
 public class SearchCommand implements SearchCommandInterface {
     private final SearchOperator searchOperator;
-    private final JteRenderer jteRenderer;
 
 
     @Inject
-    public SearchCommand(SearchOperator searchOperator,
-                         JteRenderer jteRenderer) throws IOException {
+    public SearchCommand(SearchOperator searchOperator){
         this.searchOperator = searchOperator;
-        this.jteRenderer = jteRenderer;
     }
 
     @Override
@@ -137,7 +137,7 @@ public class SearchSiteInfoService {
             @PathParam String domainName,
             @QueryParam String view,
             @QueryParam Integer page
-    ) throws SQLException {
+    ) throws SQLException, ExecutionException {
 
         if (null == domainName || domainName.isBlank()) {
             return null;
@@ -36,10 +36,11 @@
         </div>
 
+        @if (filters.showRecentOption.isSet()) <input type="hidden" name="js" value="${filters.removeJsOption.value()}"> @endif
+        @if (filters.reduceAdtechOption.isSet()) <input type="hidden" name="adtech" value="${filters.reduceAdtechOption.value()}"> @endif
+        @if (filters.searchTitleOption.isSet()) <input type="hidden" name="searchTitle" value="${filters.searchTitleOption.value()}"> @endif
+        @if (filters.showRecentOption.isSet()) <input type="hidden" name="recent" value="${filters.showRecentOption.value()}"> @endif
-        <input type="hidden" name="js" value="${filters.removeJsOption.value()}">
-        <input type="hidden" name="adtech" value="${filters.reduceAdtechOption.value()}">
-        <input type="hidden" name="searchTitle" value="${filters.searchTitleOption.value()}">
         <input type="hidden" name="profile" value="${profile}">
-        <input type="hidden" name="recent" value="${filters.showRecentOption.value()}">
 
         </form>
@@ -36,7 +36,7 @@
     <div class="text-slate-700 dark:text-white text-sm p-4">
         <div class="fas fa-gift mr-1 text-margeblue dark:text-slate-200"></div>
         This is the new design and home of Marginalia Search.
-        You can about what this entails <a href="https://about.marginalia-search.com/article/redesign/" class="underline text-liteblue dark:text-blue-200">here</a>.
+        You can read about what this entails <a href="https://about.marginalia-search.com/article/redesign/" class="underline text-liteblue dark:text-blue-200">here</a>.
         <p class="my-4"></p>
         The old version of Marginalia Search remains available at
         <a href="https://old-search.marginalia.nu/" class="underline text-liteblue dark:text-blue-200">https://old-search.marginalia.nu/</a>.
@@ -10,6 +10,7 @@ import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.converting.writer.ConverterBatchWriter;
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.Cookies;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.functions.searchquery.QueryFactory;
@@ -120,7 +121,7 @@ public class IntegrationTest {
     public void run() throws Exception {
 
         /** CREATE WARC */
-        try (WarcRecorder warcRecorder = new WarcRecorder(warcData)) {
+        try (WarcRecorder warcRecorder = new WarcRecorder(warcData, new Cookies())) {
             warcRecorder.writeWarcinfoHeader("127.0.0.1", new EdgeDomain("www.example.com"),
                     new HttpFetcherImpl.DomainProbeResult.Ok(new EdgeUrl("https://www.example.com/")));
 
@@ -72,11 +72,11 @@ services:
     image: "mariadb:lts"
     container_name: "mariadb"
     env_file: "${INSTALL_DIR}/env/mariadb.env"
-    command: ['mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
+    command: ['mariadbd', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
     ports:
       - "127.0.0.1:3306:3306/tcp"
     healthcheck:
-      test: mysqladmin ping -h 127.0.0.1 -u ${uval} --password=${pval}
+      test: mariadb-admin ping -h 127.0.0.1 -u ${uval} --password=${pval}
       start_period: 5s
       interval: 5s
       timeout: 5s
@@ -103,11 +103,11 @@ services:
     image: "mariadb:lts"
     container_name: "mariadb"
     env_file: "${INSTALL_DIR}/env/mariadb.env"
-    command: ['mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
+    command: ['mariadbd', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
     ports:
       - "127.0.0.1:3306:3306/tcp"
     healthcheck:
-      test: mysqladmin ping -h 127.0.0.1 -u ${uval} --password=${pval}
+      test: mariadb-admin ping -h 127.0.0.1 -u ${uval} --password=${pval}
       start_period: 5s
       interval: 5s
       timeout: 5s
@@ -129,11 +129,11 @@ services:
     image: "mariadb:lts"
     container_name: "mariadb"
     env_file: "${INSTALL_DIR}/env/mariadb.env"
-    command: ['mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
+    command: ['mariadbd', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
     ports:
       - "127.0.0.1:3306:3306/tcp"
     healthcheck:
-      test: mysqladmin ping -h 127.0.0.1 -u ${uval} --password=${pval}
+      test: mariadb-admin ping -h 127.0.0.1 -u ${uval} --password=${pval}
       start_period: 5s
       interval: 5s
       timeout: 5s
@@ -3,11 +3,11 @@ services:
     image: "mariadb:lts"
     container_name: "mariadb"
     env_file: "${INSTALL_DIR}/env/mariadb.env"
-    command: ['mysqld', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
+    command: ['mariadbd', '--character-set-server=utf8mb4', '--collation-server=utf8mb4_unicode_ci']
     ports:
       - "127.0.0.1:3306:3306/tcp"
     healthcheck:
-      test: mysqladmin ping -h 127.0.0.1 -u ${uval} --password=${pval}
+      test: mariadb-admin ping -h 127.0.0.1 -u ${uval} --password=${pval}
       start_period: 5s
       interval: 5s
       timeout: 5s