Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-10-06 07:32:38 +02:00)

Compare commits: deploy-000...deploy-001 (18 commits)
SHA1: 3bc99639a0, 927bc0b63c, d968801dc1, 89db69d360, 895cee7004, 4bb71b8439, e4a41f7dd1, 69ad6287b1, 41a59dcf45, 94d4d2edb7, 7ae19a92ba, 56d14e56d7, a557c7ae7f, b66879ccb1, f1b7157ca2, 7622335e84, 0da2047eae, 5ee4321110
ROADMAP.md (10 changes)
@@ -21,7 +21,7 @@ word n-grams known beforehand. This limits the ability to interpret longer quer
 The positions mask should be supplemented or replaced with a more accurate (e.g.) gamma coded positions
 list, as is the civilized way of doing this.
 
-Completed with PR https://github.com/MarginaliaSearch/MarginaliaSearch/pull/99
+Completed with PR [#99](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/99)
 
 ## Hybridize crawler w/ Common Crawl data
 
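Note on the wording above: a "gamma coded positions list" conventionally means storing the gaps between successive term positions as Elias gamma codes, so small gaps take very few bits. The sketch below is purely illustrative and is not the implementation completed in PR #99; the class and method names are invented for the example.

```java
import java.util.List;

// Illustrative sketch only: encoding a sorted positions list as Elias gamma coded gaps.
class GammaPositionCodec {
    /** Append the Elias gamma code of n (n >= 1) to a bit string. */
    private static void gammaEncode(StringBuilder bits, int n) {
        String bin = Integer.toBinaryString(n);     // e.g. 5 -> "101"
        bits.append("0".repeat(bin.length() - 1));  // length prefix in unary
        bits.append(bin);                           // then the value itself
    }

    /** Encode strictly increasing term positions as gamma coded deltas. */
    static String encode(List<Integer> positions) {
        StringBuilder bits = new StringBuilder();
        int prev = 0;
        for (int pos : positions) {
            gammaEncode(bits, pos - prev); // gaps are small, so codes stay short
            prev = pos;
        }
        return bits.toString();
    }

    public static void main(String[] args) {
        // positions 3, 4, 9 -> gaps 3, 1, 5 -> "011" + "1" + "00101"
        System.out.println(encode(List.of(3, 4, 9)));
    }
}
```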
@@ -41,6 +41,12 @@ The search engine has a bit of a problem showing spicy content mixed in with the
 to have a way to filter this out. It's likely something like a URL blacklist (e.g. [UT1](https://dsi.ut-capitole.fr/blacklists/index_en.php) )
 combined with naive bayesian filter would go a long way, or something more sophisticated...?
 
+## Web Design Overhaul
+
+The design is kinda clunky and hard to maintain, and needlessly outdated-looking.
+
+In progress: PR [#127](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/127) -- demo available at https://test.marginalia.nu/
+
 ## Additional Language Support
 
 It would be desirable if the search engine supported more languages than English. This is partially about
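To make the "URL blacklist combined with naive bayesian filter" idea above concrete, here is a minimal, hypothetical sketch; none of the names below exist in the codebase. A blocklist such as UT1 acts as a hard filter, and a naive Bayes score over page tokens handles everything the list misses.

```java
import java.util.Map;
import java.util.Set;

// Hypothetical illustration of the ROADMAP idea, not project code.
class SpicyContentFilter {
    private final Set<String> blockedDomains;             // e.g. loaded from the UT1 blacklist
    private final Map<String, double[]> tokenLikelihoods; // token -> {P(token|nsfw), P(token|clean)}
    private final double priorNsfw;

    SpicyContentFilter(Set<String> blockedDomains, Map<String, double[]> tokenLikelihoods, double priorNsfw) {
        this.blockedDomains = blockedDomains;
        this.tokenLikelihoods = tokenLikelihoods;
        this.priorNsfw = priorNsfw;
    }

    boolean shouldFilter(String domain, Iterable<String> tokens) {
        if (blockedDomains.contains(domain))
            return true;

        // Naive Bayes in log space: compare log P(nsfw) + sum log P(token|nsfw)
        // against the same quantity for the "clean" class.
        double logNsfw = Math.log(priorNsfw);
        double logClean = Math.log(1 - priorNsfw);
        for (String token : tokens) {
            double[] p = tokenLikelihoods.get(token);
            if (p == null) continue; // unseen tokens carry no evidence in this sketch
            logNsfw += Math.log(p[0]);
            logClean += Math.log(p[1]);
        }
        return logNsfw > logClean;
    }
}
```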
@@ -56,7 +62,7 @@ it should be extended to all domains. It would also be interesting to offer sea
 RSS data itself, or use the RSS set to feed a special live index that updates faster than the
 main dataset.
 
-Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122)
+Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122) and PR [#125](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/125)
 
 ## Support for binary formats like PDF
 
@@ -50,12 +50,18 @@ public class LiveCrawlActor extends RecordActorPrototype {
 yield new Monitor("-");
 }
 case Monitor(String feedsHash) -> {
+// Sleep initially in case this is during start-up
 for (;;) {
-String currentHash = feedsClient.getFeedDataHash();
-if (!Objects.equals(currentHash, feedsHash)) {
-yield new LiveCrawl(currentHash);
+try {
+Thread.sleep(Duration.ofMinutes(15));
+String currentHash = feedsClient.getFeedDataHash();
+if (!Objects.equals(currentHash, feedsHash)) {
+yield new LiveCrawl(currentHash);
+}
+}
+catch (RuntimeException ex) {
+logger.error("Failed to fetch feed data hash");
 }
-Thread.sleep(Duration.ofMinutes(15));
 }
 }
 case LiveCrawl(String feedsHash, long msgId) when msgId < 0 -> {
@@ -59,12 +59,6 @@ public class FeedsClient {
 .forEachRemaining(rsp -> consumer.accept(rsp.getDomain(), new ArrayList<>(rsp.getUrlList())));
 }
 
-public record UpdatedDomain(String domain, List<String> urls) {
-public UpdatedDomain(RpcUpdatedLinksResponse rsp) {
-this(rsp.getDomain(), new ArrayList<>(rsp.getUrlList()));
-}
-}
-
 /** Get the hash of the feed data, for identifying when the data has been updated */
 public String getFeedDataHash() {
 return channelPool.call(FeedApiGrpc.FeedApiBlockingStub::getFeedDataHash)
@@ -46,6 +46,7 @@ message RpcFeed {
 string feedUrl = 3;
 string updated = 4;
 repeated RpcFeedItem items = 5;
+int64 fetchTimestamp = 6;
 }
 
 message RpcFeedItem {
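For context, the new fetchTimestamp field is populated later in this change from FeedDb.getFetchTime().toEpochMilli(), i.e. it carries epoch milliseconds. A small, hedged example of how a consumer might interpret it; the helper below is invented for illustration, and the long parameter stands in for the getter protobuf's standard Java codegen would produce.

```java
import java.time.Duration;
import java.time.Instant;

// Illustrative only, not part of the change: deciding whether a feed snapshot is stale
// based on the epoch-milli fetchTimestamp carried by RpcFeed.
final class FeedFreshness {
    static boolean isStale(long fetchTimestampMillis, Duration maxAge) {
        Instant fetched = Instant.ofEpochMilli(fetchTimestampMillis);
        return Duration.between(fetched, Instant.now()).compareTo(maxAge) > 0;
    }
}
```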
@@ -24,6 +24,7 @@ dependencies {
 implementation project(':code:libraries:message-queue')
 
 implementation project(':code:execution:api')
+implementation project(':code:processes:crawling-process:ft-content-type')
 
 implementation libs.jsoup
 implementation libs.rssreader
@@ -8,13 +8,16 @@ import nu.marginalia.rss.model.FeedDefinition;
 import nu.marginalia.rss.model.FeedItems;
 import nu.marginalia.service.module.ServiceConfiguration;
 import org.jetbrains.annotations.NotNull;
+import org.jetbrains.annotations.Nullable;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.BufferedInputStream;
+import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
+import java.nio.file.attribute.PosixFileAttributes;
 import java.security.MessageDigest;
 import java.time.Instant;
 import java.util.Base64;
@@ -125,6 +128,26 @@ public class FeedDb {
 return FeedItems.none();
 }
 
+
+@Nullable
+public String getEtag(EdgeDomain domain) {
+if (!feedDbEnabled) {
+throw new IllegalStateException("Feed database is disabled on this node");
+}
+
+// Capture the current reader to avoid concurrency issues
+FeedDbReader reader = this.reader;
+try {
+if (reader != null) {
+return reader.getEtag(domain);
+}
+}
+catch (Exception e) {
+logger.error("Error getting etag for " + domain, e);
+}
+return null;
+}
+
 public Optional<String> getFeedAsJson(String domain) {
 if (!feedDbEnabled) {
 throw new IllegalStateException("Feed database is disabled on this node");
@@ -209,4 +232,36 @@ public class FeedDb {
 
 reader.getLinksUpdatedSince(since, consumer);
 }
+
+public Instant getFetchTime() {
+if (!Files.exists(readerDbPath)) {
+return Instant.EPOCH;
+}
+
+try {
+return Files.readAttributes(readerDbPath, PosixFileAttributes.class)
+.creationTime()
+.toInstant();
+}
+catch (IOException ex) {
+logger.error("Failed to read the creatiom time of {}", readerDbPath);
+return Instant.EPOCH;
+}
+}
+
+public boolean hasData() {
+if (!feedDbEnabled) {
+throw new IllegalStateException("Feed database is disabled on this node");
+}
+
+// Capture the current reader to avoid concurrency issues
+FeedDbReader reader = this.reader;
+
+if (reader != null) {
+return reader.hasData();
+}
+
+return false;
+}
+
 }
@@ -8,6 +8,7 @@ import nu.marginalia.rss.model.FeedItems;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import javax.annotation.Nullable;
 import java.nio.file.Path;
 import java.sql.Connection;
 import java.sql.DriverManager;
@@ -32,6 +33,7 @@ public class FeedDbReader implements AutoCloseable {
 try (var stmt = connection.createStatement()) {
 stmt.executeUpdate("CREATE TABLE IF NOT EXISTS feed (domain TEXT PRIMARY KEY, feed JSON)");
 stmt.executeUpdate("CREATE TABLE IF NOT EXISTS errors (domain TEXT PRIMARY KEY, cnt INT DEFAULT 0)");
+stmt.executeUpdate("CREATE TABLE IF NOT EXISTS etags (domain TEXT PRIMARY KEY, etag TEXT)");
 }
 }
 
@@ -106,6 +108,22 @@ public class FeedDbReader implements AutoCloseable {
 return FeedItems.none();
 }
 
+@Nullable
+public String getEtag(EdgeDomain domain) {
+try (var stmt = connection.prepareStatement("SELECT etag FROM etags WHERE DOMAIN = ?")) {
+stmt.setString(1, domain.toString());
+var rs = stmt.executeQuery();
+
+if (rs.next()) {
+return rs.getString(1);
+}
+} catch (SQLException e) {
+logger.error("Error getting etag for " + domain, e);
+}
+
+return null;
+}
+
 private FeedItems deserialize(String string) {
 return gson.fromJson(string, FeedItems.class);
 }
@@ -141,4 +159,18 @@ public class FeedDbReader implements AutoCloseable {
 }
 
 
+public boolean hasData() {
+try (var stmt = connection.prepareStatement("SELECT 1 FROM feed LIMIT 1")) {
+var rs = stmt.executeQuery();
+if (rs.next()) {
+return rs.getBoolean(1);
+}
+else {
+return false;
+}
+}
+catch (SQLException ex) {
+return false;
+}
+}
 }
@@ -20,6 +20,7 @@ public class FeedDbWriter implements AutoCloseable {
 private final Connection connection;
 private final PreparedStatement insertFeedStmt;
 private final PreparedStatement insertErrorStmt;
+private final PreparedStatement insertEtagStmt;
 private final Path dbPath;
 
 private volatile boolean closed = false;
@@ -34,10 +35,12 @@ public class FeedDbWriter implements AutoCloseable {
 try (var stmt = connection.createStatement()) {
 stmt.executeUpdate("CREATE TABLE IF NOT EXISTS feed (domain TEXT PRIMARY KEY, feed JSON)");
 stmt.executeUpdate("CREATE TABLE IF NOT EXISTS errors (domain TEXT PRIMARY KEY, cnt INT DEFAULT 0)");
+stmt.executeUpdate("CREATE TABLE IF NOT EXISTS etags (domain TEXT PRIMARY KEY, etag TEXT)");
 }
 
 insertFeedStmt = connection.prepareStatement("INSERT INTO feed (domain, feed) VALUES (?, ?)");
 insertErrorStmt = connection.prepareStatement("INSERT INTO errors (domain, cnt) VALUES (?, ?)");
+insertEtagStmt = connection.prepareStatement("INSERT INTO etags (domain, etag) VALUES (?, ?)");
 }
 
 public Path getDbPath() {
@@ -56,6 +59,20 @@ public class FeedDbWriter implements AutoCloseable {
 }
 }
 
+public synchronized void saveEtag(String domain, String etag) {
+if (etag == null || etag.isBlank())
+return;
+
+try {
+insertEtagStmt.setString(1, domain.toLowerCase());
+insertEtagStmt.setString(2, etag);
+insertEtagStmt.executeUpdate();
+}
+catch (SQLException e) {
+logger.error("Error saving etag for " + domain, e);
+}
+}
+
 public synchronized void setErrorCount(String domain, int count) {
 try {
 insertErrorStmt.setString(1, domain);
@@ -5,6 +5,8 @@ import com.apptasticsoftware.rssreader.RssReader;
 import com.google.inject.Inject;
 import com.opencsv.CSVReader;
 import nu.marginalia.WmsaHome;
+import nu.marginalia.contenttype.ContentType;
+import nu.marginalia.contenttype.DocumentBodyToString;
 import nu.marginalia.executor.client.ExecutorClient;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.nodecfg.NodeConfigurationService;
@@ -32,13 +34,10 @@ import java.net.http.HttpRequest;
 import java.net.http.HttpResponse;
 import java.nio.charset.StandardCharsets;
 import java.sql.SQLException;
-import java.time.Duration;
-import java.time.LocalDateTime;
-import java.time.ZonedDateTime;
+import java.time.*;
 import java.time.format.DateTimeFormatter;
 import java.util.*;
 import java.util.concurrent.Executors;
-import java.util.concurrent.ThreadLocalRandom;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.function.BiFunction;
@@ -60,7 +59,6 @@ public class FeedFetcherService {
 private final DomainLocks domainLocks = new DomainLocks();
 
 private volatile boolean updating;
-private boolean deterministic = false;
 
 @Inject
 public FeedFetcherService(FeedDb feedDb,
@@ -74,6 +72,17 @@ public class FeedFetcherService {
 this.nodeConfigurationService = nodeConfigurationService;
 this.serviceHeartbeat = serviceHeartbeat;
 this.executorClient = executorClient;
+
+
+// Add support for some alternate date tags for atom
+rssReader.addItemExtension("issued", this::setDateFallback);
+rssReader.addItemExtension("created", this::setDateFallback);
+}
+
+private void setDateFallback(Item item, String value) {
+if (item.getPubDate().isEmpty()) {
+item.setPubDate(value);
+}
 }
 
 public enum UpdateMode {
@@ -81,11 +90,6 @@ public class FeedFetcherService {
 REFRESH
 };
 
-/** Disable random-based heuristics. This is meant for testing */
-public void setDeterministic() {
-this.deterministic = true;
-}
-
 public void updateFeeds(UpdateMode updateMode) throws IOException {
 if (updating) // Prevent concurrent updates
 {
@@ -124,51 +128,57 @@ public class FeedFetcherService {
 
 for (var feed : definitions) {
 executor.submitQuietly(() -> {
-var oldData = feedDb.getFeed(new EdgeDomain(feed.domain()));
-// If we have existing data, we might skip updating it with a probability that increases with time,
-// this is to avoid hammering the feeds that are updated very rarely and save some time and resources
-// on our end
-if (!oldData.isEmpty()) {
-Duration duration = feed.durationSinceUpdated();
-long daysSinceUpdate = duration.toDays();
-if (deterministic || (daysSinceUpdate > 2 && ThreadLocalRandom.current()
-.nextInt(1, 1 + (int) Math.min(10, daysSinceUpdate) / 2) > 1))
-{
-// Skip updating this feed, just write the old data back instead
-writer.saveFeed(oldData);
-return;
+try {
+EdgeDomain domain = new EdgeDomain(feed.domain());
+var oldData = feedDb.getFeed(domain);
+
+@Nullable
+String ifModifiedSinceDate = switch(updateMode) {
+case REFRESH -> getIfModifiedSinceDate(feedDb);
+case CLEAN -> null;
+};
+
+@Nullable
+String ifNoneMatchTag = switch (updateMode) {
+case REFRESH -> feedDb.getEtag(domain);
+case CLEAN -> null;
+};
+
+FetchResult feedData;
+try (DomainLocks.DomainLock domainLock = domainLocks.lockDomain(new EdgeDomain(feed.domain()))) {
+feedData = fetchFeedData(feed, client, ifModifiedSinceDate, ifNoneMatchTag);
+} catch (Exception ex) {
+feedData = new FetchResult.TransientError();
 }
-}
 
-FetchResult feedData;
-try (DomainLocks.DomainLock domainLock = domainLocks.lockDomain(new EdgeDomain(feed.domain()))) {
-feedData = fetchFeedData(feed, client);
-}
-catch (Exception ex) {
-feedData = new FetchResult.TransientError();
-}
+switch (feedData) {
+case FetchResult.Success(String value, String etag) -> {
+writer.saveEtag(feed.domain(), etag);
+writer.saveFeed(parseFeed(value, feed));
+}
+case FetchResult.NotModified() -> {
+writer.saveEtag(feed.domain(), ifNoneMatchTag);
 
-switch (feedData) {
-case FetchResult.Success(String value) -> writer.saveFeed(parseFeed(value, feed));
-case FetchResult.TransientError() -> {
-int errorCount = errorCounts.getOrDefault(feed.domain().toLowerCase(), 0);
-writer.setErrorCount(feed.domain().toLowerCase(), ++errorCount);
-
-if (errorCount < 5) {
-// Permit the server a few days worth of retries before we drop the feed entirely
 writer.saveFeed(oldData);
 }
-}
-case FetchResult.PermanentError() -> {} // let the definition be forgotten about
-}
+case FetchResult.TransientError() -> {
+int errorCount = errorCounts.getOrDefault(feed.domain().toLowerCase(), 0);
+writer.setErrorCount(feed.domain().toLowerCase(), ++errorCount);
 
-if ((definitionsUpdated.incrementAndGet() % 1_000) == 0) {
-// Update the progress every 1k feeds, to avoid hammering the database and flooding the logs
-heartbeat.progress("Updated " + definitionsUpdated + "/" + totalDefinitions + " feeds", definitionsUpdated.get(), totalDefinitions);
+if (errorCount < 5) {
+// Permit the server a few days worth of retries before we drop the feed entirely
+writer.saveFeed(oldData);
+}
+}
+case FetchResult.PermanentError() -> {
+} // let the definition be forgotten about
+}
+
+}
+finally {
+if ((definitionsUpdated.incrementAndGet() % 1_000) == 0) {
+// Update the progress every 1k feeds, to avoid hammering the database and flooding the logs
+heartbeat.progress("Updated " + definitionsUpdated + "/" + totalDefinitions + " feeds", definitionsUpdated.get(), totalDefinitions);
+}
 }
 });
 }
@@ -196,30 +206,73 @@ public class FeedFetcherService {
 }
 }
 
-private FetchResult fetchFeedData(FeedDefinition feed, HttpClient client) {
+@Nullable
+static String getIfModifiedSinceDate(FeedDb feedDb) {
+
+// If the db is fresh, we don't send If-Modified-Since
+if (!feedDb.hasData())
+return null;
+
+Instant cutoffInstant = feedDb.getFetchTime();
+
+// If we're unable to establish fetch time, we don't send If-Modified-Since
+if (cutoffInstant == Instant.EPOCH)
+return null;
+
+return cutoffInstant.atZone(ZoneId.of("GMT")).format(DateTimeFormatter.RFC_1123_DATE_TIME);
+}
+
+private FetchResult fetchFeedData(FeedDefinition feed,
+HttpClient client,
+@Nullable String ifModifiedSinceDate,
+@Nullable String ifNoneMatchTag)
+{
 try {
 URI uri = new URI(feed.feedUrl());
 
-HttpRequest getRequest = HttpRequest.newBuilder()
+HttpRequest.Builder requestBuilder = HttpRequest.newBuilder()
 .GET()
 .uri(uri)
 .header("User-Agent", WmsaHome.getUserAgent().uaIdentifier())
+.header("Accept-Encoding", "gzip")
 .header("Accept", "text/*, */*;q=0.9")
 .timeout(Duration.ofSeconds(15))
-.build();
+;
+
+if (ifModifiedSinceDate != null) {
+requestBuilder.header("If-Modified-Since", ifModifiedSinceDate);
+}
+
+if (ifNoneMatchTag != null) {
+requestBuilder.header("If-None-Match", ifNoneMatchTag);
+}
+
+HttpRequest getRequest = requestBuilder.build();
+
 for (int i = 0; i < 3; i++) {
-var rs = client.send(getRequest, HttpResponse.BodyHandlers.ofString());
-if (429 == rs.statusCode()) {
+HttpResponse<byte[]> rs = client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray());
+if (rs.statusCode() == 429) { // Too Many Requests
 int retryAfter = Integer.parseInt(rs.headers().firstValue("Retry-After").orElse("2"));
 Thread.sleep(Duration.ofSeconds(Math.clamp(retryAfter, 1, 5)));
-} else if (200 == rs.statusCode()) {
-return new FetchResult.Success(rs.body());
-} else if (404 == rs.statusCode()) {
-return new FetchResult.PermanentError(); // never try again
-} else {
-return new FetchResult.TransientError(); // we try again in a few days
+continue;
 }
+
+String newEtagValue = rs.headers().firstValue("ETag").orElse("");
+
+return switch (rs.statusCode()) {
+case 200 -> {
+byte[] responseData = getResponseData(rs);
+
+String contentType = rs.headers().firstValue("Content-Type").orElse("");
+String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), responseData);
+
+yield new FetchResult.Success(bodyText, newEtagValue);
+}
+case 304 -> new FetchResult.NotModified(); // via If-Modified-Since semantics
+case 404 -> new FetchResult.PermanentError(); // never try again
+default -> new FetchResult.TransientError(); // we try again later
+};
 }
 }
 catch (Exception ex) {
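Stepping back from the diff: what this hunk introduces is a standard HTTP conditional GET. The fetcher remembers validators from a previous fetch (a date for If-Modified-Since and/or an ETag for If-None-Match), replays them on the next request, and treats a 304 response as "the cached copy is still current". Below is a condensed, self-contained sketch of that round trip using only the java.net.http API; it is not the project's code, and the feed URL is just an example.

```java
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;

// Minimal sketch of the conditional-GET pattern the change adopts.
public class ConditionalFetchSketch {
    public static void main(String[] args) throws Exception {
        HttpClient client = HttpClient.newHttpClient();
        String feedUrl = "https://www.marginalia.nu/log/index.xml"; // example feed
        String storedEtag = null;                                   // e.g. loaded from a local db

        HttpRequest.Builder builder = HttpRequest.newBuilder()
                .GET()
                .uri(URI.create(feedUrl))
                .timeout(Duration.ofSeconds(15));
        if (storedEtag != null) {
            builder.header("If-None-Match", storedEtag); // replay the validator from last time
        }

        HttpResponse<byte[]> rs = client.send(builder.build(), HttpResponse.BodyHandlers.ofByteArray());
        switch (rs.statusCode()) {
            case 200 -> {
                // Fresh content: keep the new validator for the next fetch
                String newEtag = rs.headers().firstValue("ETag").orElse("");
                System.out.println("Fetched " + rs.body().length + " bytes, etag=" + newEtag);
            }
            case 304 -> System.out.println("Not modified; reuse the cached feed");
            default -> System.out.println("Unexpected status " + rs.statusCode());
        }
    }
}
```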
@@ -229,8 +282,22 @@ public class FeedFetcherService {
 return new FetchResult.TransientError();
 }
 
+private byte[] getResponseData(HttpResponse<byte[]> response) throws IOException {
+String encoding = response.headers().firstValue("Content-Encoding").orElse("");
+
+if ("gzip".equals(encoding)) {
+try (var stream = new GZIPInputStream(new ByteArrayInputStream(response.body()))) {
+return stream.readAllBytes();
+}
+}
+else {
+return response.body();
+}
+}
+
 public sealed interface FetchResult {
-record Success(String value) implements FetchResult {}
+record Success(String value, String etag) implements FetchResult {}
+record NotModified() implements FetchResult {}
 record TransientError() implements FetchResult {}
 record PermanentError() implements FetchResult {}
 }
@@ -300,6 +367,8 @@ public class FeedFetcherService {
 
 public FeedItems parseFeed(String feedData, FeedDefinition definition) {
 try {
+feedData = sanitizeEntities(feedData);
+
 List<Item> rawItems = rssReader.read(
 // Massage the data to maximize the possibility of the flaky XML parser consuming it
 new BOMInputStream(new ByteArrayInputStream(feedData.trim().getBytes(StandardCharsets.UTF_8)), false)
@@ -326,6 +395,32 @@ public class FeedFetcherService {
 }
 }
 
+private static final Map<String, String> HTML_ENTITIES = Map.of(
+"&raquo;", "»",
+"&laquo;", "«",
+"&mdash;", "--",
+"&ndash;", "-",
+"&rsquo;", "'",
+"&lsquo;", "'",
+"&nbsp;", ""
+);
+
+/** The XML parser will blow up if you insert HTML entities in the feed XML,
+ * which is unfortunately relatively common. Replace them as far as is possible
+ * with their corresponding characters
+ */
+static String sanitizeEntities(String feedData) {
+String result = feedData;
+for (Map.Entry<String, String> entry : HTML_ENTITIES.entrySet()) {
+result = result.replace(entry.getKey(), entry.getValue());
+}
+
+// Handle lone ampersands not part of a recognized XML entity
+result = result.replaceAll("&(?!(amp|lt|gt|apos|quot);)", "&amp;");
+
+return result;
+}
+
 /** Decide whether to keep URI fragments in the feed items.
  * <p></p>
  * We keep fragments if there are multiple different fragments in the items.
@@ -361,7 +456,7 @@ public class FeedFetcherService {
 return seenFragments.size() > 1;
 }
 
-private static class IsFeedItemDateValid implements Predicate<FeedItem> {
+static class IsFeedItemDateValid implements Predicate<FeedItem> {
 private final String today = ZonedDateTime.now().format(DateTimeFormatter.ISO_ZONED_DATE_TIME);
 
 public boolean test(FeedItem item) {
@@ -107,8 +107,7 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis
 
 @Override
 public void getFeed(RpcDomainId request,
-StreamObserver<RpcFeed> responseObserver)
-{
+StreamObserver<RpcFeed> responseObserver) {
 if (!feedDb.isEnabled()) {
 responseObserver.onError(new IllegalStateException("Feed database is disabled on this node"));
 return;
@@ -126,7 +125,8 @@ public class FeedsGrpcService extends FeedApiGrpc.FeedApiImplBase implements Dis
 .setDomainId(request.getDomainId())
 .setDomain(domainName.get().toString())
 .setFeedUrl(feedItems.feedUrl())
-.setUpdated(feedItems.updated());
+.setUpdated(feedItems.updated())
+.setFetchTimestamp(feedDb.getFetchTime().toEpochMilli());
 
 for (var item : feedItems.items()) {
 retB.addItemsBuilder()
@@ -96,10 +96,31 @@ class FeedFetcherServiceTest extends AbstractModule {
 feedDb.switchDb(writer);
 }
 
-feedFetcherService.setDeterministic();
 feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
 
-Assertions.assertFalse(feedDb.getFeed(new EdgeDomain("www.marginalia.nu")).isEmpty());
+var result = feedDb.getFeed(new EdgeDomain("www.marginalia.nu"));
+System.out.println(result);
+Assertions.assertFalse(result.isEmpty());
+}
+
+@Tag("flaky")
+@Test
+public void testFetchRepeatedly() throws Exception {
+try (var writer = feedDb.createWriter()) {
+writer.saveFeed(new FeedItems("www.marginalia.nu", "https://www.marginalia.nu/log/index.xml", "", List.of()));
+feedDb.switchDb(writer);
+}
+
+feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
+Assertions.assertNotNull(feedDb.getEtag(new EdgeDomain("www.marginalia.nu")));
+feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
+Assertions.assertNotNull(feedDb.getEtag(new EdgeDomain("www.marginalia.nu")));
+feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
+Assertions.assertNotNull(feedDb.getEtag(new EdgeDomain("www.marginalia.nu")));
+
+var result = feedDb.getFeed(new EdgeDomain("www.marginalia.nu"));
+System.out.println(result);
+Assertions.assertFalse(result.isEmpty());
 }
 
 @Tag("flaky")
@@ -110,7 +131,6 @@ class FeedFetcherServiceTest extends AbstractModule {
 feedDb.switchDb(writer);
 }
 
-feedFetcherService.setDeterministic();
 feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
 
 // We forget the feed on a 404 error
@@ -0,0 +1,26 @@
+package nu.marginalia.rss.svc;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+public class TestXmlSanitization {
+
+@Test
+public void testPreservedEntities() {
+Assertions.assertEquals("&amp;", FeedFetcherService.sanitizeEntities("&amp;"));
+Assertions.assertEquals("&lt;", FeedFetcherService.sanitizeEntities("&lt;"));
+Assertions.assertEquals("&gt;", FeedFetcherService.sanitizeEntities("&gt;"));
+Assertions.assertEquals("&quot;", FeedFetcherService.sanitizeEntities("&quot;"));
+Assertions.assertEquals("&apos;", FeedFetcherService.sanitizeEntities("&apos;"));
+}
+
+@Test
+public void testStrayAmpersand() {
+Assertions.assertEquals("Bed &amp; Breakfast", FeedFetcherService.sanitizeEntities("Bed & Breakfast"));
+}
+
+@Test
+public void testTranslatedHtmlEntity() {
+Assertions.assertEquals("Foo -- Bar", FeedFetcherService.sanitizeEntities("Foo &mdash; Bar"));
+}
+}
@@ -85,7 +85,7 @@ class BTreeWriterTest {
 public void testWriteEntrySize2() throws IOException {
 BTreeContext ctx = new BTreeContext(4, 2, BTreeBlockSize.BS_64);
 
-var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat");
+var tempFile = Files.createTempFile("tst", "dat");
 
 int[] data = generateItems32(64);
 
@@ -25,12 +25,11 @@ public class ProcessedDocumentDetails {
 
 public List<EdgeUrl> linksInternal;
 public List<EdgeUrl> linksExternal;
-public List<EdgeUrl> feedLinks;
 
 public DocumentMetadata metadata;
 public GeneratorType generator;
 
 public String toString() {
-return "ProcessedDocumentDetails(title=" + this.title + ", description=" + this.description + ", pubYear=" + this.pubYear + ", length=" + this.length + ", quality=" + this.quality + ", hashCode=" + this.hashCode + ", features=" + this.features + ", standard=" + this.standard + ", linksInternal=" + this.linksInternal + ", linksExternal=" + this.linksExternal + ", feedLinks=" + this.feedLinks + ", metadata=" + this.metadata + ", generator=" + this.generator + ")";
+return "ProcessedDocumentDetails(title=" + this.title + ", description=" + this.description + ", pubYear=" + this.pubYear + ", length=" + this.length + ", quality=" + this.quality + ", hashCode=" + this.hashCode + ", features=" + this.features + ", standard=" + this.standard + ", linksInternal=" + this.linksInternal + ", linksExternal=" + this.linksExternal + ", metadata=" + this.metadata + ", generator=" + this.generator + ")";
 }
 }
|
@@ -34,7 +34,6 @@ public class LinkProcessor {
|
|||||||
|
|
||||||
ret.linksExternal = new ArrayList<>();
|
ret.linksExternal = new ArrayList<>();
|
||||||
ret.linksInternal = new ArrayList<>();
|
ret.linksInternal = new ArrayList<>();
|
||||||
ret.feedLinks = new ArrayList<>();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Set<EdgeUrl> getSeenUrls() {
|
public Set<EdgeUrl> getSeenUrls() {
|
||||||
@@ -72,19 +71,6 @@
 }
 }
 
-/** Accepts a link as a feed link */
-public void acceptFeed(EdgeUrl link) {
-if (!isLinkPermitted(link)) {
-return;
-}
-
-if (!seenUrls.add(link)) {
-return;
-}
-
-ret.feedLinks.add(link);
-}
-
 private boolean isLinkPermitted(EdgeUrl link) {
 if (!permittedSchemas.contains(link.proto.toLowerCase())) {
 return false;
@@ -294,11 +294,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
 for (var meta : doc.select("meta[http-equiv=refresh]")) {
 linkParser.parseMetaRedirect(baseUrl, meta).ifPresent(lp::accept);
 }
-for (var link : doc.select("link[rel=alternate]")) {
-feedExtractor
-.getFeedFromAlternateTag(baseUrl, link)
-.ifPresent(lp::acceptFeed);
-}
 
 words.addAllSyntheticTerms(FileLinks.createFileLinkKeywords(lp, domain));
 words.addAllSyntheticTerms(FileLinks.createFileEndingKeywords(doc));
@@ -125,7 +125,6 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
 /* These are assumed to be populated */
 ret.linksInternal = new ArrayList<>();
 ret.linksExternal = new ArrayList<>();
-ret.feedLinks = new ArrayList<>();
 
 return new DetailsWithWords(ret, words);
 }
@@ -166,7 +166,6 @@ public class StackexchangeSideloader implements SideloadSource {
 ret.details.length = 128;
 
 ret.details.standard = HtmlStandard.HTML5;
-ret.details.feedLinks = List.of();
 ret.details.linksExternal = List.of();
 ret.details.linksInternal = List.of();
 ret.state = UrlIndexingState.OK;
@@ -178,7 +178,6 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
 public void writeDomainData(ProcessedDomain domain) throws IOException {
 DomainMetadata metadata = DomainMetadata.from(domain);
 
-List<String> feeds = getFeedUrls(domain);
 
 domainWriter.write(
 new SlopDomainRecord(
@@ -188,25 +187,11 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
 metadata.visited(),
 Optional.ofNullable(domain.state).map(DomainIndexingState::toString).orElse(""),
 Optional.ofNullable(domain.redirect).map(EdgeDomain::toString).orElse(""),
-domain.ip,
-feeds
+domain.ip
 )
 );
 }
 
-private List<String> getFeedUrls(ProcessedDomain domain) {
-var documents = domain.documents;
-if (documents == null)
-return List.of();
-
-return documents.stream().map(doc -> doc.details)
-.filter(Objects::nonNull)
-.flatMap(dets -> dets.feedLinks.stream())
-.distinct()
-.map(EdgeUrl::toString)
-.toList();
-}
-
 public void close() throws IOException {
 domainWriter.close();
 documentWriter.close();
@@ -1,7 +1,6 @@
 package nu.marginalia.model.processed;
 
 import nu.marginalia.slop.SlopTable;
-import nu.marginalia.slop.column.array.ObjectArrayColumn;
 import nu.marginalia.slop.column.primitive.IntColumn;
 import nu.marginalia.slop.column.string.EnumColumn;
 import nu.marginalia.slop.column.string.TxtStringColumn;
@@ -10,7 +9,6 @@ import nu.marginalia.slop.desc.StorageType;
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
-import java.util.List;
 import java.util.function.Consumer;
 
 public record SlopDomainRecord(
@@ -20,8 +18,7 @@ public record SlopDomainRecord(
 int visitedUrls,
 String state,
 String redirectDomain,
-String ip,
-List<String> rssFeeds)
+String ip)
 {
 
 public record DomainWithIpProjection(
@@ -38,9 +35,6 @@ public record SlopDomainRecord(
 private static final IntColumn goodUrlsColumn = new IntColumn("goodUrls", StorageType.PLAIN);
 private static final IntColumn visitedUrlsColumn = new IntColumn("visitedUrls", StorageType.PLAIN);
 
-private static final ObjectArrayColumn<String> rssFeedsColumn = new TxtStringColumn("rssFeeds", StandardCharsets.UTF_8, StorageType.GZIP).asArray();
-
-
 public static class DomainNameReader extends SlopTable {
 private final TxtStringColumn.Reader domainsReader;
 
@@ -101,8 +95,6 @@ public record SlopDomainRecord(
 private final IntColumn.Reader goodUrlsReader;
 private final IntColumn.Reader visitedUrlsReader;
 
-private final ObjectArrayColumn<String>.Reader rssFeedsReader;
-
 public Reader(SlopTable.Ref<SlopDomainRecord> ref) throws IOException {
 super(ref);
 
@@ -114,8 +106,6 @@ public record SlopDomainRecord(
 knownUrlsReader = knownUrlsColumn.open(this);
 goodUrlsReader = goodUrlsColumn.open(this);
 visitedUrlsReader = visitedUrlsColumn.open(this);
-
-rssFeedsReader = rssFeedsColumn.open(this);
 }
 
 public Reader(Path baseDir, int page) throws IOException {
@@ -140,8 +130,7 @@ public record SlopDomainRecord(
 visitedUrlsReader.get(),
 statesReader.get(),
 redirectReader.get(),
-ipReader.get(),
-rssFeedsReader.get()
+ipReader.get()
 );
 }
 }
@@ -156,8 +145,6 @@ public record SlopDomainRecord(
 private final IntColumn.Writer goodUrlsWriter;
 private final IntColumn.Writer visitedUrlsWriter;
 
-private final ObjectArrayColumn<String>.Writer rssFeedsWriter;
-
 public Writer(Path baseDir, int page) throws IOException {
 super(baseDir, page);
 
@@ -169,8 +156,6 @@ public record SlopDomainRecord(
 knownUrlsWriter = knownUrlsColumn.create(this);
 goodUrlsWriter = goodUrlsColumn.create(this);
 visitedUrlsWriter = visitedUrlsColumn.create(this);
-
-rssFeedsWriter = rssFeedsColumn.create(this);
 }
 
 public void write(SlopDomainRecord record) throws IOException {
@@ -182,8 +167,6 @@ public record SlopDomainRecord(
 knownUrlsWriter.put(record.knownUrls());
 goodUrlsWriter.put(record.goodUrls());
 visitedUrlsWriter.put(record.visitedUrls());
-
-rssFeedsWriter.put(record.rssFeeds());
 }
 }
 }
@@ -9,7 +9,6 @@ import org.junit.jupiter.api.Test;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.List;
 
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -35,8 +34,7 @@ public class SlopDomainRecordTest {
 1, 2, 3,
 "state",
 "redirectDomain",
-"192.168.0.1",
-List.of("rss1", "rss2")
+"192.168.0.1"
 );
 
 try (var writer = new SlopDomainRecord.Writer(testDir, 0)) {
@@ -7,6 +7,7 @@ import nu.marginalia.WmsaHome;
 import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.crawl.CrawlerMain;
+import nu.marginalia.crawl.DomainStateDb;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@@ -46,6 +47,7 @@ public class CrawlingThenConvertingIntegrationTest {
 
 private Path fileName;
 private Path fileName2;
+private Path dbTempFile;
 
 @BeforeAll
 public static void setUpAll() {
@@ -63,16 +65,18 @@ public class CrawlingThenConvertingIntegrationTest {
 httpFetcher = new HttpFetcherImpl(WmsaHome.getUserAgent().uaString());
 this.fileName = Files.createTempFile("crawling-then-converting", ".warc.gz");
 this.fileName2 = Files.createTempFile("crawling-then-converting", ".warc.gz");
+this.dbTempFile = Files.createTempFile("domains", "db");
 }
 
 @AfterEach
 public void tearDown() throws IOException {
 Files.deleteIfExists(fileName);
 Files.deleteIfExists(fileName2);
+Files.deleteIfExists(dbTempFile);
 }
 
 @Test
-public void testInvalidDomain() throws IOException {
+public void testInvalidDomain() throws Exception {
 // Attempt to fetch an invalid domain
 var specs = new CrawlerMain.CrawlSpecRecord("invalid.invalid.invalid", 10);
 
@@ -88,7 +92,7 @@ public class CrawlingThenConvertingIntegrationTest {
 }
 
 @Test
-public void testRedirectingDomain() throws IOException {
+public void testRedirectingDomain() throws Exception {
 // Attempt to fetch an invalid domain
 var specs = new CrawlerMain.CrawlSpecRecord("memex.marginalia.nu", 10);
 
@@ -107,7 +111,7 @@ public class CrawlingThenConvertingIntegrationTest {
 }
 
 @Test
-public void testBlockedDomain() throws IOException {
+public void testBlockedDomain() throws Exception {
 // Attempt to fetch an invalid domain
 var specs = new CrawlerMain.CrawlSpecRecord("search.marginalia.nu", 10);
 
@@ -124,7 +128,7 @@ public class CrawlingThenConvertingIntegrationTest {
 }
 
 @Test
-public void crawlSunnyDay() throws IOException {
+public void crawlSunnyDay() throws Exception {
 var specs = new CrawlerMain.CrawlSpecRecord("www.marginalia.nu", 10);
 
 CrawledDomain domain = crawl(specs);
@@ -157,7 +161,7 @@ public class CrawlingThenConvertingIntegrationTest {
 
 
 @Test
-public void crawlContentTypes() throws IOException {
+public void crawlContentTypes() throws Exception {
 var specs = new CrawlerMain.CrawlSpecRecord("www.marginalia.nu", 10,
 List.of(
 "https://www.marginalia.nu/sanic.png",
@@ -195,7 +199,7 @@ public class CrawlingThenConvertingIntegrationTest {
 
 
 @Test
-public void crawlRobotsTxt() throws IOException {
+public void crawlRobotsTxt() throws Exception {
 var specs = new CrawlerMain.CrawlSpecRecord("search.marginalia.nu", 5,
 List.of("https://search.marginalia.nu/search?q=hello+world")
 );
@@ -235,15 +239,17 @@ public class CrawlingThenConvertingIntegrationTest {
 return null; // unreachable
 }
 }
-private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs) throws IOException {
+private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs) throws Exception {
 return crawl(specs, domain -> true);
 }
 
-private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws IOException {
+private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws Exception {
 List<SerializableCrawlData> data = new ArrayList<>();
 
-try (var recorder = new WarcRecorder(fileName)) {
-new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).crawlDomain();
+try (var recorder = new WarcRecorder(fileName);
+var db = new DomainStateDb(dbTempFile))
+{
+new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, db, recorder).crawlDomain();
 }
 
 CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain(),
@@ -46,6 +46,8 @@ dependencies {
 
 implementation libs.notnull
 implementation libs.guava
+implementation libs.sqlite
+
 implementation dependencies.create(libs.guice.get()) {
 exclude group: 'com.google.guava'
 }
@@ -241,6 +241,7 @@ public class CrawlerMain extends ProcessMainClass {
 
 // Set up the work log and the warc archiver so we can keep track of what we've done
 try (WorkLog workLog = new WorkLog(outputDir.resolve("crawler.log"));
+DomainStateDb domainStateDb = new DomainStateDb(outputDir.resolve("domainstate.db"));
 WarcArchiverIf warcArchiver = warcArchiverFactory.get(outputDir);
 AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(domainsToCrawl)
 ) {
@@ -258,6 +259,7 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
anchorTagsSource,
|
anchorTagsSource,
|
||||||
outputDir,
|
outputDir,
|
||||||
warcArchiver,
|
warcArchiver,
|
||||||
|
domainStateDb,
|
||||||
workLog);
|
workLog);
|
||||||
|
|
||||||
if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) == null) {
|
if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) == null) {
|
||||||
@@ -299,11 +301,12 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
heartbeat.start();
|
heartbeat.start();
|
||||||
|
|
||||||
try (WorkLog workLog = new WorkLog(outputDir.resolve("crawler-" + targetDomainName.replace('/', '-') + ".log"));
|
try (WorkLog workLog = new WorkLog(outputDir.resolve("crawler-" + targetDomainName.replace('/', '-') + ".log"));
|
||||||
|
DomainStateDb domainStateDb = new DomainStateDb(outputDir.resolve("domainstate.db"));
|
||||||
WarcArchiverIf warcArchiver = warcArchiverFactory.get(outputDir);
|
WarcArchiverIf warcArchiver = warcArchiverFactory.get(outputDir);
|
||||||
AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(List.of(new EdgeDomain(targetDomainName)))
|
AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(List.of(new EdgeDomain(targetDomainName)))
|
||||||
) {
|
) {
|
||||||
var spec = new CrawlSpecRecord(targetDomainName, 1000, List.of());
|
var spec = new CrawlSpecRecord(targetDomainName, 1000, List.of());
|
||||||
var task = new CrawlTask(spec, anchorTagsSource, outputDir, warcArchiver, workLog);
|
var task = new CrawlTask(spec, anchorTagsSource, outputDir, warcArchiver, domainStateDb, workLog);
|
||||||
task.run();
|
task.run();
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
@@ -324,18 +327,21 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
private final AnchorTagsSource anchorTagsSource;
|
private final AnchorTagsSource anchorTagsSource;
|
||||||
private final Path outputDir;
|
private final Path outputDir;
|
||||||
private final WarcArchiverIf warcArchiver;
|
private final WarcArchiverIf warcArchiver;
|
||||||
|
private final DomainStateDb domainStateDb;
|
||||||
private final WorkLog workLog;
|
private final WorkLog workLog;
|
||||||
|
|
||||||
CrawlTask(CrawlSpecRecord specification,
|
CrawlTask(CrawlSpecRecord specification,
|
||||||
AnchorTagsSource anchorTagsSource,
|
AnchorTagsSource anchorTagsSource,
|
||||||
Path outputDir,
|
Path outputDir,
|
||||||
WarcArchiverIf warcArchiver,
|
WarcArchiverIf warcArchiver,
|
||||||
|
DomainStateDb domainStateDb,
|
||||||
WorkLog workLog)
|
WorkLog workLog)
|
||||||
{
|
{
|
||||||
this.specification = specification;
|
this.specification = specification;
|
||||||
this.anchorTagsSource = anchorTagsSource;
|
this.anchorTagsSource = anchorTagsSource;
|
||||||
this.outputDir = outputDir;
|
this.outputDir = outputDir;
|
||||||
this.warcArchiver = warcArchiver;
|
this.warcArchiver = warcArchiver;
|
||||||
|
this.domainStateDb = domainStateDb;
|
||||||
this.workLog = workLog;
|
this.workLog = workLog;
|
||||||
|
|
||||||
this.domain = specification.domain();
|
this.domain = specification.domain();
|
||||||
@@ -359,7 +365,7 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
|
try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
|
||||||
var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder);
|
var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
|
||||||
CrawlDataReference reference = getReference();
|
CrawlDataReference reference = getReference();
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
|
@@ -0,0 +1,127 @@
|
|||||||
|
package nu.marginalia.crawl;
|
||||||
|
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import javax.annotation.Nullable;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.sql.Connection;
|
||||||
|
import java.sql.DriverManager;
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.time.Instant;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
/** Supplemental sqlite database for storing the summary of a crawl.
|
||||||
|
* One database exists per crawl data set.
|
||||||
|
* */
|
||||||
|
public class DomainStateDb implements AutoCloseable {
|
||||||
|
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(DomainStateDb.class);
|
||||||
|
|
||||||
|
private final Connection connection;
|
||||||
|
|
||||||
|
public record SummaryRecord(
|
||||||
|
String domainName,
|
||||||
|
Instant lastUpdated,
|
||||||
|
String state,
|
||||||
|
@Nullable String stateDesc,
|
||||||
|
@Nullable String feedUrl
|
||||||
|
)
|
||||||
|
{
|
||||||
|
public static SummaryRecord forSuccess(String domainName) {
|
||||||
|
return new SummaryRecord(domainName, Instant.now(), "OK", null, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static SummaryRecord forSuccess(String domainName, String feedUrl) {
|
||||||
|
return new SummaryRecord(domainName, Instant.now(), "OK", null, feedUrl);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static SummaryRecord forError(String domainName, String state, String stateDesc) {
|
||||||
|
return new SummaryRecord(domainName, Instant.now(), state, stateDesc, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean equals(Object other) {
|
||||||
|
if (other == this) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (!(other instanceof SummaryRecord(String name, Instant updated, String state1, String desc, String url))) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return domainName.equals(name) &&
|
||||||
|
lastUpdated.toEpochMilli() == updated.toEpochMilli() &&
|
||||||
|
state.equals(state1) &&
|
||||||
|
(stateDesc == null ? desc == null : stateDesc.equals(desc)) &&
|
||||||
|
(feedUrl == null ? url == null : feedUrl.equals(url));
|
||||||
|
}
|
||||||
|
|
||||||
|
public int hashCode() {
|
||||||
|
return domainName.hashCode() + Long.hashCode(lastUpdated.toEpochMilli());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public DomainStateDb(Path filename) throws SQLException {
|
||||||
|
String sqliteDbString = "jdbc:sqlite:" + filename.toString();
|
||||||
|
connection = DriverManager.getConnection(sqliteDbString);
|
||||||
|
|
||||||
|
try (var stmt = connection.createStatement()) {
|
||||||
|
stmt.executeUpdate("""
|
||||||
|
CREATE TABLE IF NOT EXISTS summary (
|
||||||
|
domain TEXT PRIMARY KEY,
|
||||||
|
lastUpdatedEpochMs LONG NOT NULL,
|
||||||
|
state TEXT NOT NULL,
|
||||||
|
stateDesc TEXT,
|
||||||
|
feedUrl TEXT
|
||||||
|
)
|
||||||
|
""");
|
||||||
|
|
||||||
|
stmt.execute("PRAGMA journal_mode=WAL");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() throws SQLException {
|
||||||
|
connection.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void save(SummaryRecord record) {
|
||||||
|
try (var stmt = connection.prepareStatement("""
|
||||||
|
INSERT OR REPLACE INTO summary (domain, lastUpdatedEpochMs, state, stateDesc, feedUrl)
|
||||||
|
VALUES (?, ?, ?, ?, ?)
|
||||||
|
""")) {
|
||||||
|
stmt.setString(1, record.domainName());
|
||||||
|
stmt.setLong(2, record.lastUpdated().toEpochMilli());
|
||||||
|
stmt.setString(3, record.state());
|
||||||
|
stmt.setString(4, record.stateDesc());
|
||||||
|
stmt.setString(5, record.feedUrl());
|
||||||
|
stmt.executeUpdate();
|
||||||
|
} catch (SQLException e) {
|
||||||
|
logger.error("Failed to insert summary record", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public Optional<SummaryRecord> get(String domainName) {
|
||||||
|
try (var stmt = connection.prepareStatement("""
|
||||||
|
SELECT domain, lastUpdatedEpochMs, state, stateDesc, feedUrl
|
||||||
|
FROM summary
|
||||||
|
WHERE domain = ?
|
||||||
|
""")) {
|
||||||
|
stmt.setString(1, domainName);
|
||||||
|
var rs = stmt.executeQuery();
|
||||||
|
if (rs.next()) {
|
||||||
|
return Optional.of(new SummaryRecord(
|
||||||
|
rs.getString("domain"),
|
||||||
|
Instant.ofEpochMilli(rs.getLong("lastUpdatedEpochMs")),
|
||||||
|
rs.getString("state"),
|
||||||
|
rs.getString("stateDesc"),
|
||||||
|
rs.getString("feedUrl")
|
||||||
|
));
|
||||||
|
}
|
||||||
|
} catch (SQLException e) {
|
||||||
|
logger.error("Failed to get summary record", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
}
|
@@ -139,7 +139,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
public ContentTypeProbeResult probeContentType(EdgeUrl url,
|
public ContentTypeProbeResult probeContentType(EdgeUrl url,
|
||||||
WarcRecorder warcRecorder,
|
WarcRecorder warcRecorder,
|
||||||
ContentTags tags) throws RateLimitException {
|
ContentTags tags) throws RateLimitException {
|
||||||
if (tags.isEmpty()) {
|
if (tags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) {
|
||||||
var headBuilder = new Request.Builder().head()
|
var headBuilder = new Request.Builder().head()
|
||||||
.addHeader("User-agent", userAgentString)
|
.addHeader("User-agent", userAgentString)
|
||||||
.addHeader("Accept-Encoding", "gzip")
|
.addHeader("Accept-Encoding", "gzip")
|
||||||
|
@@ -4,6 +4,7 @@ import crawlercommons.robots.SimpleRobotRules;
|
|||||||
import nu.marginalia.atags.model.DomainLinks;
|
import nu.marginalia.atags.model.DomainLinks;
|
||||||
import nu.marginalia.contenttype.ContentType;
|
import nu.marginalia.contenttype.ContentType;
|
||||||
import nu.marginalia.crawl.CrawlerMain;
|
import nu.marginalia.crawl.CrawlerMain;
|
||||||
|
import nu.marginalia.crawl.DomainStateDb;
|
||||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||||
@@ -16,7 +17,9 @@ import nu.marginalia.ip_blocklist.UrlBlocklist;
|
|||||||
import nu.marginalia.link_parser.LinkParser;
|
import nu.marginalia.link_parser.LinkParser;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
|
import nu.marginalia.model.body.DocumentBodyExtractor;
|
||||||
import nu.marginalia.model.body.HttpFetchResult;
|
import nu.marginalia.model.body.HttpFetchResult;
|
||||||
|
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
@@ -46,6 +49,7 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
|
|
||||||
private final DomainProber domainProber;
|
private final DomainProber domainProber;
|
||||||
private final DomainCrawlFrontier crawlFrontier;
|
private final DomainCrawlFrontier crawlFrontier;
|
||||||
|
private final DomainStateDb domainStateDb;
|
||||||
private final WarcRecorder warcRecorder;
|
private final WarcRecorder warcRecorder;
|
||||||
private final CrawlerRevisitor crawlerRevisitor;
|
private final CrawlerRevisitor crawlerRevisitor;
|
||||||
|
|
||||||
@@ -55,8 +59,10 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
public CrawlerRetreiver(HttpFetcher fetcher,
|
public CrawlerRetreiver(HttpFetcher fetcher,
|
||||||
DomainProber domainProber,
|
DomainProber domainProber,
|
||||||
CrawlerMain.CrawlSpecRecord specs,
|
CrawlerMain.CrawlSpecRecord specs,
|
||||||
|
DomainStateDb domainStateDb,
|
||||||
WarcRecorder warcRecorder)
|
WarcRecorder warcRecorder)
|
||||||
{
|
{
|
||||||
|
this.domainStateDb = domainStateDb;
|
||||||
this.warcRecorder = warcRecorder;
|
this.warcRecorder = warcRecorder;
|
||||||
this.fetcher = fetcher;
|
this.fetcher = fetcher;
|
||||||
this.domainProber = domainProber;
|
this.domainProber = domainProber;
|
||||||
@@ -90,8 +96,21 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
try {
|
try {
|
||||||
// Do an initial domain probe to determine the root URL
|
// Do an initial domain probe to determine the root URL
|
||||||
EdgeUrl rootUrl;
|
EdgeUrl rootUrl;
|
||||||
if (probeRootUrl() instanceof HttpFetcher.DomainProbeResult.Ok ok) rootUrl = ok.probedUrl();
|
|
||||||
else return 1;
|
var probeResult = probeRootUrl();
|
||||||
|
switch (probeResult) {
|
||||||
|
case HttpFetcher.DomainProbeResult.Ok(EdgeUrl probedUrl) -> {
|
||||||
|
rootUrl = probedUrl; // Good track
|
||||||
|
}
|
||||||
|
case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> {
|
||||||
|
domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString()));
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
case HttpFetcher.DomainProbeResult.Error(CrawlerDomainStatus status, String desc) -> {
|
||||||
|
domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, status.toString(), desc));
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Sleep after the initial probe, we don't have access to the robots.txt yet
|
// Sleep after the initial probe, we don't have access to the robots.txt yet
|
||||||
// so we don't know the crawl delay
|
// so we don't know the crawl delay
|
||||||
@@ -114,7 +133,8 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
|
|
||||||
delayTimer.waitFetchDelay(0); // initial delay after robots.txt
|
delayTimer.waitFetchDelay(0); // initial delay after robots.txt
|
||||||
|
|
||||||
sniffRootDocument(rootUrl, delayTimer);
|
DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(rootUrl, delayTimer);
|
||||||
|
domainStateDb.save(summaryRecord);
|
||||||
|
|
||||||
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
|
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
|
||||||
if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
|
if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
|
||||||
@@ -196,7 +216,9 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
return domainProbeResult;
|
return domainProbeResult;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
|
private DomainStateDb.SummaryRecord sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
|
||||||
|
Optional<String> feedLink = Optional.empty();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
var url = rootUrl.withPathAndParam("/", null);
|
var url = rootUrl.withPathAndParam("/", null);
|
||||||
|
|
||||||
@@ -204,11 +226,11 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
timer.waitFetchDelay(0);
|
timer.waitFetchDelay(0);
|
||||||
|
|
||||||
if (!(result instanceof HttpFetchResult.ResultOk ok))
|
if (!(result instanceof HttpFetchResult.ResultOk ok))
|
||||||
return;
|
return DomainStateDb.SummaryRecord.forSuccess(domain);
|
||||||
|
|
||||||
var optDoc = ok.parseDocument();
|
var optDoc = ok.parseDocument();
|
||||||
if (optDoc.isEmpty())
|
if (optDoc.isEmpty())
|
||||||
return;
|
return DomainStateDb.SummaryRecord.forSuccess(domain);
|
||||||
|
|
||||||
// Sniff the software based on the sample document
|
// Sniff the software based on the sample document
|
||||||
var doc = optDoc.get();
|
var doc = optDoc.get();
|
||||||
@@ -216,7 +238,6 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
crawlFrontier.enqueueLinksFromDocument(url, doc);
|
crawlFrontier.enqueueLinksFromDocument(url, doc);
|
||||||
|
|
||||||
EdgeUrl faviconUrl = url.withPathAndParam("/favicon.ico", null);
|
EdgeUrl faviconUrl = url.withPathAndParam("/favicon.ico", null);
|
||||||
Optional<EdgeUrl> sitemapUrl = Optional.empty();
|
|
||||||
|
|
||||||
for (var link : doc.getElementsByTag("link")) {
|
for (var link : doc.getElementsByTag("link")) {
|
||||||
String rel = link.attr("rel");
|
String rel = link.attr("rel");
|
||||||
@@ -232,23 +253,33 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
|
|
||||||
// Grab the RSS/Atom as a sitemap if it exists
|
// Grab the RSS/Atom as a sitemap if it exists
|
||||||
if (rel.equalsIgnoreCase("alternate")
|
if (rel.equalsIgnoreCase("alternate")
|
||||||
&& (type.equalsIgnoreCase("application/atom+xml") || type.equalsIgnoreCase("application/atomsvc+xml"))) {
|
&& (type.equalsIgnoreCase("application/atom+xml")
|
||||||
|
|| type.equalsIgnoreCase("application/atomsvc+xml")
|
||||||
|
|| type.equalsIgnoreCase("application/rss+xml")
|
||||||
|
)) {
|
||||||
String href = link.attr("href");
|
String href = link.attr("href");
|
||||||
|
|
||||||
sitemapUrl = linkParser.parseLink(url, href)
|
feedLink = linkParser.parseLink(url, href)
|
||||||
.filter(crawlFrontier::isSameDomain);
|
.filter(crawlFrontier::isSameDomain)
|
||||||
|
.map(EdgeUrl::toString);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Download the sitemap if available exists
|
|
||||||
if (sitemapUrl.isPresent()) {
|
if (feedLink.isEmpty()) {
|
||||||
sitemapFetcher.downloadSitemaps(List.of(sitemapUrl.get()));
|
feedLink = guessFeedUrl(timer);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Download the sitemap if available
|
||||||
|
if (feedLink.isPresent()) {
|
||||||
|
sitemapFetcher.downloadSitemaps(List.of(feedLink.get()));
|
||||||
timer.waitFetchDelay(0);
|
timer.waitFetchDelay(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Grab the favicon if it exists
|
// Grab the favicon if it exists
|
||||||
fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
|
fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
|
||||||
timer.waitFetchDelay(0);
|
timer.waitFetchDelay(0);
|
||||||
|
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
logger.error("Error configuring link filter", ex);
|
logger.error("Error configuring link filter", ex);
|
||||||
@@ -256,6 +287,74 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
finally {
|
finally {
|
||||||
crawlFrontier.addVisited(rootUrl);
|
crawlFrontier.addVisited(rootUrl);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (feedLink.isPresent()) {
|
||||||
|
return DomainStateDb.SummaryRecord.forSuccess(domain, feedLink.get());
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return DomainStateDb.SummaryRecord.forSuccess(domain);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private final List<String> likelyFeedEndpoints = List.of(
|
||||||
|
"rss.xml",
|
||||||
|
"atom.xml",
|
||||||
|
"feed.xml",
|
||||||
|
"index.xml",
|
||||||
|
"feed",
|
||||||
|
"rss",
|
||||||
|
"atom",
|
||||||
|
"feeds",
|
||||||
|
"blog/feed",
|
||||||
|
"blog/rss"
|
||||||
|
);
|
||||||
|
|
||||||
|
private Optional<String> guessFeedUrl(CrawlDelayTimer timer) throws InterruptedException {
|
||||||
|
var oldDomainStateRecord = domainStateDb.get(domain);
|
||||||
|
|
||||||
|
// If we are already aware of an old feed URL, then we can just revalidate it
|
||||||
|
if (oldDomainStateRecord.isPresent()) {
|
||||||
|
var oldRecord = oldDomainStateRecord.get();
|
||||||
|
if (oldRecord.feedUrl() != null && validateFeedUrl(oldRecord.feedUrl(), timer)) {
|
||||||
|
return Optional.of(oldRecord.feedUrl());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (String endpoint : likelyFeedEndpoints) {
|
||||||
|
String url = "https://" + domain + "/" + endpoint;
|
||||||
|
if (validateFeedUrl(url, timer)) {
|
||||||
|
return Optional.of(url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean validateFeedUrl(String url, CrawlDelayTimer timer) throws InterruptedException {
|
||||||
|
var parsedOpt = EdgeUrl.parse(url);
|
||||||
|
if (parsedOpt.isEmpty())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
HttpFetchResult result = fetchWithRetry(parsedOpt.get(), timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
|
||||||
|
timer.waitFetchDelay(0);
|
||||||
|
|
||||||
|
if (!(result instanceof HttpFetchResult.ResultOk ok)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract the beginning of the
|
||||||
|
Optional<String> bodyOpt = DocumentBodyExtractor.asString(ok).getBody();
|
||||||
|
if (bodyOpt.isEmpty())
|
||||||
|
return false;
|
||||||
|
String body = bodyOpt.get();
|
||||||
|
body = body.substring(0, Math.min(128, body.length())).toLowerCase();
|
||||||
|
|
||||||
|
if (body.contains("<atom"))
|
||||||
|
return true;
|
||||||
|
if (body.contains("<rss"))
|
||||||
|
return true;
|
||||||
|
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public HttpFetchResult fetchContentWithReference(EdgeUrl top,
|
public HttpFetchResult fetchContentWithReference(EdgeUrl top,
|
||||||
|
@@ -7,9 +7,9 @@ import nu.marginalia.model.EdgeUrl;
|
|||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Optional;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
public class SitemapFetcher {
|
public class SitemapFetcher {
|
||||||
@@ -24,26 +24,27 @@ public class SitemapFetcher {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public void downloadSitemaps(SimpleRobotRules robotsRules, EdgeUrl rootUrl) {
|
public void downloadSitemaps(SimpleRobotRules robotsRules, EdgeUrl rootUrl) {
|
||||||
List<String> sitemaps = robotsRules.getSitemaps();
|
List<String> urls = robotsRules.getSitemaps();
|
||||||
|
|
||||||
List<EdgeUrl> urls = new ArrayList<>(sitemaps.size());
|
if (urls.isEmpty()) {
|
||||||
if (!sitemaps.isEmpty()) {
|
urls = List.of(rootUrl.withPathAndParam("/sitemap.xml", null).toString());
|
||||||
for (var url : sitemaps) {
|
|
||||||
EdgeUrl.parse(url).ifPresent(urls::add);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
urls.add(rootUrl.withPathAndParam("/sitemap.xml", null));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
downloadSitemaps(urls);
|
downloadSitemaps(urls);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void downloadSitemaps(List<EdgeUrl> urls) {
|
public void downloadSitemaps(List<String> urls) {
|
||||||
|
|
||||||
Set<String> checkedSitemaps = new HashSet<>();
|
Set<String> checkedSitemaps = new HashSet<>();
|
||||||
|
|
||||||
for (var url : urls) {
|
for (var rawUrl : urls) {
|
||||||
|
Optional<EdgeUrl> parsedUrl = EdgeUrl.parse(rawUrl);
|
||||||
|
if (parsedUrl.isEmpty()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
EdgeUrl url = parsedUrl.get();
|
||||||
|
|
||||||
// Let's not download sitemaps from other domains for now
|
// Let's not download sitemaps from other domains for now
|
||||||
if (!crawlFrontier.isSameDomain(url)) {
|
if (!crawlFrontier.isSameDomain(url)) {
|
||||||
continue;
|
continue;
|
||||||
|
@@ -18,6 +18,7 @@ public class ContentTypeLogic {
|
|||||||
"application/xhtml",
|
"application/xhtml",
|
||||||
"application/xml",
|
"application/xml",
|
||||||
"application/atom+xml",
|
"application/atom+xml",
|
||||||
|
"application/atomsvc+xml",
|
||||||
"application/rss+xml",
|
"application/rss+xml",
|
||||||
"application/x-rss+xml",
|
"application/x-rss+xml",
|
||||||
"application/rdf+xml",
|
"application/rdf+xml",
|
||||||
|
@@ -23,6 +23,10 @@ public sealed interface DocumentBodyResult<T> {
|
|||||||
return mapper.apply(contentType, body);
|
return mapper.apply(contentType, body);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Optional<T> getBody() {
|
||||||
|
return Optional.of(body);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void ifPresent(ExConsumer<T, Exception> consumer) throws Exception {
|
public void ifPresent(ExConsumer<T, Exception> consumer) throws Exception {
|
||||||
consumer.accept(contentType, body);
|
consumer.accept(contentType, body);
|
||||||
@@ -41,6 +45,11 @@ public sealed interface DocumentBodyResult<T> {
|
|||||||
return (DocumentBodyResult<T2>) this;
|
return (DocumentBodyResult<T2>) this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Optional<T> getBody() {
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void ifPresent(ExConsumer<T, Exception> consumer) throws Exception {
|
public void ifPresent(ExConsumer<T, Exception> consumer) throws Exception {
|
||||||
}
|
}
|
||||||
@@ -49,6 +58,7 @@ public sealed interface DocumentBodyResult<T> {
|
|||||||
<T2> Optional<T2> mapOpt(BiFunction<ContentType, T, T2> mapper);
|
<T2> Optional<T2> mapOpt(BiFunction<ContentType, T, T2> mapper);
|
||||||
<T2> Optional<T2> flatMapOpt(BiFunction<ContentType, T, Optional<T2>> mapper);
|
<T2> Optional<T2> flatMapOpt(BiFunction<ContentType, T, Optional<T2>> mapper);
|
||||||
<T2> DocumentBodyResult<T2> flatMap(BiFunction<ContentType, T, DocumentBodyResult<T2>> mapper);
|
<T2> DocumentBodyResult<T2> flatMap(BiFunction<ContentType, T, DocumentBodyResult<T2>> mapper);
|
||||||
|
Optional<T> getBody();
|
||||||
|
|
||||||
void ifPresent(ExConsumer<T,Exception> consumer) throws Exception;
|
void ifPresent(ExConsumer<T,Exception> consumer) throws Exception;
|
||||||
|
|
||||||
|
@@ -0,0 +1,66 @@
|
|||||||
|
package nu.marginalia.crawl;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.AfterEach;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.time.Instant;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
|
class DomainStateDbTest {
|
||||||
|
|
||||||
|
Path tempFile;
|
||||||
|
@BeforeEach
|
||||||
|
void setUp() throws IOException {
|
||||||
|
tempFile = Files.createTempFile(getClass().getSimpleName(), ".db");
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterEach
|
||||||
|
void tearDown() throws IOException {
|
||||||
|
Files.deleteIfExists(tempFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSunnyDay() throws SQLException {
|
||||||
|
try (var db = new DomainStateDb(tempFile)) {
|
||||||
|
var allFields = new DomainStateDb.SummaryRecord(
|
||||||
|
"all.marginalia.nu",
|
||||||
|
Instant.now(),
|
||||||
|
"OK",
|
||||||
|
"Bad address",
|
||||||
|
"https://www.marginalia.nu/atom.xml"
|
||||||
|
);
|
||||||
|
|
||||||
|
var minFields = new DomainStateDb.SummaryRecord(
|
||||||
|
"min.marginalia.nu",
|
||||||
|
Instant.now(),
|
||||||
|
"OK",
|
||||||
|
null,
|
||||||
|
null
|
||||||
|
);
|
||||||
|
|
||||||
|
db.save(allFields);
|
||||||
|
db.save(minFields);
|
||||||
|
|
||||||
|
assertEquals(allFields, db.get("all.marginalia.nu").orElseThrow());
|
||||||
|
assertEquals(minFields, db.get("min.marginalia.nu").orElseThrow());
|
||||||
|
|
||||||
|
var updatedAllFields = new DomainStateDb.SummaryRecord(
|
||||||
|
"all.marginalia.nu",
|
||||||
|
Instant.now(),
|
||||||
|
"BAD",
|
||||||
|
null,
|
||||||
|
null
|
||||||
|
);
|
||||||
|
|
||||||
|
db.save(updatedAllFields);
|
||||||
|
assertEquals(updatedAllFields, db.get("all.marginalia.nu").orElseThrow());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@@ -42,24 +42,24 @@ class ContentTypeProberTest {
|
|||||||
port = r.nextInt(10000) + 8000;
|
port = r.nextInt(10000) + 8000;
|
||||||
server = HttpServer.create(new InetSocketAddress("127.0.0.1", port), 10);
|
server = HttpServer.create(new InetSocketAddress("127.0.0.1", port), 10);
|
||||||
|
|
||||||
server.createContext("/html", exchange -> {
|
server.createContext("/html.gz", exchange -> {
|
||||||
exchange.getResponseHeaders().add("Content-Type", "text/html");
|
exchange.getResponseHeaders().add("Content-Type", "text/html");
|
||||||
exchange.sendResponseHeaders(200, -1);
|
exchange.sendResponseHeaders(200, -1);
|
||||||
exchange.close();
|
exchange.close();
|
||||||
});
|
});
|
||||||
server.createContext("/redir", exchange -> {
|
server.createContext("/redir.gz", exchange -> {
|
||||||
exchange.getResponseHeaders().add("Location", "/html");
|
exchange.getResponseHeaders().add("Location", "/html.gz");
|
||||||
exchange.sendResponseHeaders(301, -1);
|
exchange.sendResponseHeaders(301, -1);
|
||||||
exchange.close();
|
exchange.close();
|
||||||
});
|
});
|
||||||
|
|
||||||
server.createContext("/bin", exchange -> {
|
server.createContext("/bin.gz", exchange -> {
|
||||||
exchange.getResponseHeaders().add("Content-Type", "application/binary");
|
exchange.getResponseHeaders().add("Content-Type", "application/binary");
|
||||||
exchange.sendResponseHeaders(200, -1);
|
exchange.sendResponseHeaders(200, -1);
|
||||||
exchange.close();
|
exchange.close();
|
||||||
});
|
});
|
||||||
|
|
||||||
server.createContext("/timeout", exchange -> {
|
server.createContext("/timeout.gz", exchange -> {
|
||||||
try {
|
try {
|
||||||
Thread.sleep(15_000);
|
Thread.sleep(15_000);
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
@@ -73,10 +73,10 @@ class ContentTypeProberTest {
|
|||||||
|
|
||||||
server.start();
|
server.start();
|
||||||
|
|
||||||
htmlEndpoint = EdgeUrl.parse("http://localhost:" + port + "/html").get();
|
htmlEndpoint = EdgeUrl.parse("http://localhost:" + port + "/html.gz").get();
|
||||||
binaryEndpoint = EdgeUrl.parse("http://localhost:" + port + "/bin").get();
|
binaryEndpoint = EdgeUrl.parse("http://localhost:" + port + "/bin.gz").get();
|
||||||
timeoutEndpoint = EdgeUrl.parse("http://localhost:" + port + "/timeout").get();
|
timeoutEndpoint = EdgeUrl.parse("http://localhost:" + port + "/timeout.gz").get();
|
||||||
htmlRedirEndpoint = EdgeUrl.parse("http://localhost:" + port + "/redir").get();
|
htmlRedirEndpoint = EdgeUrl.parse("http://localhost:" + port + "/redir.gz").get();
|
||||||
|
|
||||||
fetcher = new HttpFetcherImpl("test");
|
fetcher = new HttpFetcherImpl("test");
|
||||||
recorder = new WarcRecorder(warcFile);
|
recorder = new WarcRecorder(warcFile);
|
||||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.crawling.retreival;
|
|||||||
|
|
||||||
import crawlercommons.robots.SimpleRobotRules;
|
import crawlercommons.robots.SimpleRobotRules;
|
||||||
import nu.marginalia.crawl.CrawlerMain;
|
import nu.marginalia.crawl.CrawlerMain;
|
||||||
|
import nu.marginalia.crawl.DomainStateDb;
|
||||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||||
@@ -18,6 +19,7 @@ import nu.marginalia.model.crawldata.SerializableCrawlData;
|
|||||||
import nu.marginalia.test.CommonTestData;
|
import nu.marginalia.test.CommonTestData;
|
||||||
import okhttp3.Headers;
|
import okhttp3.Headers;
|
||||||
import org.junit.jupiter.api.AfterEach;
|
import org.junit.jupiter.api.AfterEach;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.mockito.Mockito;
|
import org.mockito.Mockito;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
@@ -25,6 +27,9 @@ import org.slf4j.LoggerFactory;
|
|||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.sql.SQLException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@@ -36,9 +41,14 @@ public class CrawlerMockFetcherTest {
|
|||||||
|
|
||||||
Map<EdgeUrl, CrawledDocument> mockData = new HashMap<>();
|
Map<EdgeUrl, CrawledDocument> mockData = new HashMap<>();
|
||||||
HttpFetcher fetcherMock = new MockFetcher();
|
HttpFetcher fetcherMock = new MockFetcher();
|
||||||
|
private Path dbTempFile;
|
||||||
|
@BeforeEach
|
||||||
|
public void setUp() throws IOException {
|
||||||
|
dbTempFile = Files.createTempFile("domains","db");
|
||||||
|
}
|
||||||
@AfterEach
|
@AfterEach
|
||||||
public void tearDown() {
|
public void tearDown() throws IOException {
|
||||||
|
Files.deleteIfExists(dbTempFile);
|
||||||
mockData.clear();
|
mockData.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -66,15 +76,17 @@ public class CrawlerMockFetcherTest {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void crawl(CrawlerMain.CrawlSpecRecord spec) throws IOException {
|
void crawl(CrawlerMain.CrawlSpecRecord spec) throws IOException, SQLException {
|
||||||
try (var recorder = new WarcRecorder()) {
|
try (var recorder = new WarcRecorder();
|
||||||
new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder)
|
var db = new DomainStateDb(dbTempFile)
|
||||||
|
) {
|
||||||
|
new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, db, recorder)
|
||||||
.crawlDomain();
|
.crawlDomain();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testLemmy() throws URISyntaxException, IOException {
|
public void testLemmy() throws Exception {
|
||||||
List<SerializableCrawlData> out = new ArrayList<>();
|
List<SerializableCrawlData> out = new ArrayList<>();
|
||||||
|
|
||||||
registerUrlClasspathData(new EdgeUrl("https://startrek.website/"), "mock-crawl-data/lemmy/index.html");
|
registerUrlClasspathData(new EdgeUrl("https://startrek.website/"), "mock-crawl-data/lemmy/index.html");
|
||||||
@@ -85,7 +97,7 @@ public class CrawlerMockFetcherTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testMediawiki() throws URISyntaxException, IOException {
|
public void testMediawiki() throws Exception {
|
||||||
List<SerializableCrawlData> out = new ArrayList<>();
|
List<SerializableCrawlData> out = new ArrayList<>();
|
||||||
|
|
||||||
registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html");
|
registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html");
|
||||||
@@ -94,7 +106,7 @@ public class CrawlerMockFetcherTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testDiscourse() throws URISyntaxException, IOException {
|
public void testDiscourse() throws Exception {
|
||||||
List<SerializableCrawlData> out = new ArrayList<>();
|
List<SerializableCrawlData> out = new ArrayList<>();
|
||||||
|
|
||||||
registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/"), "mock-crawl-data/discourse/index.html");
|
registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/"), "mock-crawl-data/discourse/index.html");
|
||||||
|
@@ -4,6 +4,7 @@ import nu.marginalia.UserAgent;
|
|||||||
import nu.marginalia.WmsaHome;
|
import nu.marginalia.WmsaHome;
|
||||||
import nu.marginalia.atags.model.DomainLinks;
|
import nu.marginalia.atags.model.DomainLinks;
|
||||||
import nu.marginalia.crawl.CrawlerMain;
|
import nu.marginalia.crawl.CrawlerMain;
|
||||||
|
import nu.marginalia.crawl.DomainStateDb;
|
||||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||||
@@ -25,6 +26,7 @@ import java.io.RandomAccessFile;
|
|||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
import java.sql.SQLException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
@@ -39,11 +41,13 @@ class CrawlerRetreiverTest {
|
|||||||
Path tempFileWarc2;
|
Path tempFileWarc2;
|
||||||
Path tempFileParquet2;
|
Path tempFileParquet2;
|
||||||
Path tempFileWarc3;
|
Path tempFileWarc3;
|
||||||
|
Path tempFileDb;
|
||||||
@BeforeEach
|
@BeforeEach
|
||||||
public void setUp() throws IOException {
|
public void setUp() throws IOException {
|
||||||
httpFetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
|
httpFetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
|
||||||
tempFileParquet1 = Files.createTempFile("crawling-process", ".parquet");
|
tempFileParquet1 = Files.createTempFile("crawling-process", ".parquet");
|
||||||
tempFileParquet2 = Files.createTempFile("crawling-process", ".parquet");
|
tempFileParquet2 = Files.createTempFile("crawling-process", ".parquet");
|
||||||
|
tempFileDb = Files.createTempFile("crawling-process", ".db");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -505,22 +509,26 @@ class CrawlerRetreiverTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
|
private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
|
||||||
try (var recorder = new WarcRecorder(tempFileWarc2)) {
|
try (var recorder = new WarcRecorder(tempFileWarc2);
|
||||||
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).crawlDomain(new DomainLinks(),
|
var db = new DomainStateDb(tempFileDb)
|
||||||
|
) {
|
||||||
|
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(),
|
||||||
new CrawlDataReference(stream));
|
new CrawlDataReference(stream));
|
||||||
}
|
}
|
||||||
catch (IOException ex) {
|
catch (IOException | SQLException ex) {
|
||||||
Assertions.fail(ex);
|
Assertions.fail(ex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@NotNull
|
@NotNull
|
||||||
private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlerMain.CrawlSpecRecord specs) {
|
private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlerMain.CrawlSpecRecord specs) {
|
||||||
try (var recorder = new WarcRecorder(tempFileWarc1)) {
|
try (var recorder = new WarcRecorder(tempFileWarc1);
|
||||||
var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder);
|
var db = new DomainStateDb(tempFileDb)
|
||||||
|
) {
|
||||||
|
var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder);
|
||||||
crawler.crawlDomain();
|
crawler.crawlDomain();
|
||||||
return crawler.getCrawlFrontier();
|
return crawler.getCrawlFrontier();
|
||||||
} catch (IOException ex) {
|
} catch (IOException| SQLException ex) {
|
||||||
Assertions.fail(ex);
|
Assertions.fail(ex);
|
||||||
return null; // unreachable
|
return null; // unreachable
|
||||||
}
|
}
|
||||||
|
@@ -179,6 +179,9 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
|||||||
EdgeDomain domain = new EdgeDomain(entry.getKey());
|
EdgeDomain domain = new EdgeDomain(entry.getKey());
|
||||||
List<String> urls = entry.getValue();
|
List<String> urls = entry.getValue();
|
||||||
|
|
||||||
|
if (urls.isEmpty())
|
||||||
|
continue;
|
||||||
|
|
||||||
fetcher.scheduleRetrieval(domain, urls);
|
fetcher.scheduleRetrieval(domain, urls);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -3,7 +3,10 @@ package nu.marginalia.livecrawler;
|
|||||||
import crawlercommons.robots.SimpleRobotRules;
|
import crawlercommons.robots.SimpleRobotRules;
|
||||||
import crawlercommons.robots.SimpleRobotRulesParser;
|
import crawlercommons.robots.SimpleRobotRulesParser;
|
||||||
import nu.marginalia.WmsaHome;
|
import nu.marginalia.WmsaHome;
|
||||||
|
import nu.marginalia.contenttype.ContentType;
|
||||||
|
import nu.marginalia.contenttype.DocumentBodyToString;
|
||||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||||
|
import nu.marginalia.crawl.logic.DomainLocks;
|
||||||
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
||||||
import nu.marginalia.db.DbDomainQueries;
|
import nu.marginalia.db.DbDomainQueries;
|
||||||
import nu.marginalia.db.DomainBlacklist;
|
import nu.marginalia.db.DomainBlacklist;
|
||||||
@@ -15,6 +18,7 @@ import org.slf4j.Logger;
|
|||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
import java.net.http.HttpClient;
|
import java.net.http.HttpClient;
|
||||||
@@ -26,6 +30,7 @@ import java.util.List;
|
|||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.concurrent.ThreadLocalRandom;
|
import java.util.concurrent.ThreadLocalRandom;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.zip.GZIPInputStream;
|
||||||
|
|
||||||
/** A simple link scraper that fetches URLs and stores them in a database,
|
/** A simple link scraper that fetches URLs and stores them in a database,
|
||||||
* with no concept of a crawl frontier, WARC output, or other advanced features
|
* with no concept of a crawl frontier, WARC output, or other advanced features
|
||||||
@@ -40,6 +45,7 @@ public class SimpleLinkScraper implements AutoCloseable {
|
|||||||
private final DomainBlacklist domainBlacklist;
|
private final DomainBlacklist domainBlacklist;
|
||||||
private final Duration connectTimeout = Duration.ofSeconds(10);
|
private final Duration connectTimeout = Duration.ofSeconds(10);
|
||||||
private final Duration readTimeout = Duration.ofSeconds(10);
|
private final Duration readTimeout = Duration.ofSeconds(10);
|
||||||
|
private final DomainLocks domainLocks = new DomainLocks();
|
||||||
|
|
||||||
public SimpleLinkScraper(LiveCrawlDataSet dataSet,
|
public SimpleLinkScraper(LiveCrawlDataSet dataSet,
|
||||||
DbDomainQueries domainQueries,
|
DbDomainQueries domainQueries,
|
||||||
@@ -65,7 +71,9 @@ public class SimpleLinkScraper implements AutoCloseable {
|
|||||||
.connectTimeout(connectTimeout)
|
.connectTimeout(connectTimeout)
|
||||||
.followRedirects(HttpClient.Redirect.NEVER)
|
.followRedirects(HttpClient.Redirect.NEVER)
|
||||||
.version(HttpClient.Version.HTTP_2)
|
.version(HttpClient.Version.HTTP_2)
|
||||||
.build()) {
|
.build();
|
||||||
|
DomainLocks.DomainLock lock = domainLocks.lockDomain(domain) // throttle concurrent access per domain; do not remove
|
||||||
|
) {
|
||||||
|
|
||||||
EdgeUrl rootUrl = domain.toRootUrlHttps();
|
EdgeUrl rootUrl = domain.toRootUrlHttps();
|
||||||
|
|
||||||
@@ -124,6 +132,7 @@ public class SimpleLinkScraper implements AutoCloseable {
|
|||||||
var robotsRequest = HttpRequest.newBuilder(rootUrl.withPathAndParam("/robots.txt", null).asURI())
|
var robotsRequest = HttpRequest.newBuilder(rootUrl.withPathAndParam("/robots.txt", null).asURI())
|
||||||
.GET()
|
.GET()
|
||||||
.header("User-Agent", WmsaHome.getUserAgent().uaString())
|
.header("User-Agent", WmsaHome.getUserAgent().uaString())
|
||||||
|
.header("Accept-Encoding","gzip")
|
||||||
.timeout(readTimeout);
|
.timeout(readTimeout);
|
||||||
|
|
||||||
// Fetch the robots.txt
|
// Fetch the robots.txt
|
||||||
@@ -131,9 +140,10 @@ public class SimpleLinkScraper implements AutoCloseable {
|
|||||||
try {
|
try {
|
||||||
SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
|
SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
|
||||||
HttpResponse<byte[]> robotsTxt = client.send(robotsRequest.build(), HttpResponse.BodyHandlers.ofByteArray());
|
HttpResponse<byte[]> robotsTxt = client.send(robotsRequest.build(), HttpResponse.BodyHandlers.ofByteArray());
|
||||||
|
|
||||||
if (robotsTxt.statusCode() == 200) {
|
if (robotsTxt.statusCode() == 200) {
|
||||||
return parser.parseContent(rootUrl.toString(),
|
return parser.parseContent(rootUrl.toString(),
|
||||||
robotsTxt.body(),
|
getResponseData(robotsTxt),
|
||||||
robotsTxt.headers().firstValue("Content-Type").orElse("text/plain"),
|
robotsTxt.headers().firstValue("Content-Type").orElse("text/plain"),
|
||||||
WmsaHome.getUserAgent().uaIdentifier());
|
WmsaHome.getUserAgent().uaIdentifier());
|
||||||
}
|
}
|
||||||
@@ -157,18 +167,19 @@ public class SimpleLinkScraper implements AutoCloseable {
|
|||||||
.GET()
|
.GET()
|
||||||
.header("User-Agent", WmsaHome.getUserAgent().uaString())
|
.header("User-Agent", WmsaHome.getUserAgent().uaString())
|
||||||
.header("Accept", "text/html")
|
.header("Accept", "text/html")
|
||||||
|
.header("Accept-Encoding", "gzip")
|
||||||
.timeout(readTimeout)
|
.timeout(readTimeout)
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
|
HttpResponse<byte[]> response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
|
||||||
|
|
||||||
// Handle rate limiting by waiting and retrying once
|
// Handle rate limiting by waiting and retrying once
|
||||||
if (response.statusCode() == 429) {
|
if (response.statusCode() == 429) {
|
||||||
timer.waitRetryDelay(new HttpFetcherImpl.RateLimitException(
|
timer.waitRetryDelay(new HttpFetcherImpl.RateLimitException(
|
||||||
response.headers().firstValue("Retry-After").orElse("5")
|
response.headers().firstValue("Retry-After").orElse("5")
|
||||||
));
|
));
|
||||||
response = client.send(request, HttpResponse.BodyHandlers.ofString());
|
response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
|
||||||
}
|
}
|
||||||
|
|
||||||
String contentType = response.headers().firstValue("Content-Type").orElse("").toLowerCase();
|
String contentType = response.headers().firstValue("Content-Type").orElse("").toLowerCase();
|
||||||
@@ -178,12 +189,14 @@ public class SimpleLinkScraper implements AutoCloseable {
|
|||||||
return new FetchResult.Error(parsedUrl);
|
return new FetchResult.Error(parsedUrl);
|
||||||
}
|
}
|
||||||
|
|
||||||
String body = response.body();
|
byte[] body = getResponseData(response);
|
||||||
if (body.length() > 1024 * 1024) {
|
if (body.length > 1024 * 1024) {
|
||||||
return new FetchResult.Error(parsedUrl);
|
return new FetchResult.Error(parsedUrl);
|
||||||
}
|
}
|
||||||
|
|
||||||
return new FetchResult.Success(domainId, parsedUrl, body, headersToString(response.headers()));
|
String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), body);
|
||||||
|
|
||||||
|
return new FetchResult.Success(domainId, parsedUrl, bodyText, headersToString(response.headers()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (IOException ex) {
|
catch (IOException ex) {
|
||||||
@@ -194,6 +207,19 @@ public class SimpleLinkScraper implements AutoCloseable {
|
|||||||
return new FetchResult.Error(parsedUrl);
|
return new FetchResult.Error(parsedUrl);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private byte[] getResponseData(HttpResponse<byte[]> response) throws IOException {
|
||||||
|
String encoding = response.headers().firstValue("Content-Encoding").orElse("");
|
||||||
|
|
||||||
|
if ("gzip".equals(encoding)) {
|
||||||
|
try (var stream = new GZIPInputStream(new ByteArrayInputStream(response.body()))) {
|
||||||
|
return stream.readAllBytes();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return response.body();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
sealed interface FetchResult {
|
sealed interface FetchResult {
|
||||||
record Success(int domainId, EdgeUrl url, String body, String headers) implements FetchResult {}
|
record Success(int domainId, EdgeUrl url, String body, String headers) implements FetchResult {}
|
||||||
record Error(EdgeUrl url) implements FetchResult {}
|
record Error(EdgeUrl url) implements FetchResult {}
|
||||||
|
@@ -0,0 +1,68 @@
|
|||||||
|
package nu.marginalia.livecrawler;
|
||||||
|
|
||||||
|
import nu.marginalia.db.DomainBlacklistImpl;
|
||||||
|
import nu.marginalia.io.SerializableCrawlDataStream;
|
||||||
|
import nu.marginalia.model.EdgeDomain;
|
||||||
|
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||||
|
import nu.marginalia.model.crawldata.CrawledDomain;
|
||||||
|
import org.apache.commons.io.FileUtils;
|
||||||
|
import org.junit.jupiter.api.AfterEach;
|
||||||
|
import org.junit.jupiter.api.Assertions;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.mockito.Mockito;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
class SimpleLinkScraperTest {
|
||||||
|
private Path tempDir;
|
||||||
|
private LiveCrawlDataSet dataSet;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
public void setUp() throws IOException, SQLException {
|
||||||
|
tempDir = Files.createTempDirectory(getClass().getSimpleName());
|
||||||
|
dataSet = new LiveCrawlDataSet(tempDir);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@AfterEach
|
||||||
|
public void tearDown() throws Exception {
|
||||||
|
dataSet.close();
|
||||||
|
FileUtils.deleteDirectory(tempDir.toFile());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testRetrieveNow() throws Exception {
|
||||||
|
var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class));
|
||||||
|
scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
|
||||||
|
|
||||||
|
var streams = dataSet.getDataStreams();
|
||||||
|
Assertions.assertEquals(1, streams.size());
|
||||||
|
|
||||||
|
SerializableCrawlDataStream firstStream = streams.iterator().next();
|
||||||
|
Assertions.assertTrue(firstStream.hasNext());
|
||||||
|
|
||||||
|
if (firstStream.next() instanceof CrawledDomain domain) {
|
||||||
|
Assertions.assertEquals("www.marginalia.nu",domain.getDomain());
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
Assertions.fail();
|
||||||
|
}
|
||||||
|
|
||||||
|
Assertions.assertTrue(firstStream.hasNext());
|
||||||
|
|
||||||
|
if ((firstStream.next() instanceof CrawledDocument document)) {
|
||||||
|
// verify we decompress the body string
|
||||||
|
Assertions.assertTrue(document.documentBody.startsWith("<!doctype"));
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
Assertions.fail();
|
||||||
|
}
|
||||||
|
|
||||||
|
Assertions.assertFalse(firstStream.hasNext());
|
||||||
|
}
|
||||||
|
}
|
@@ -1,39 +1,19 @@
|
|||||||
package nu.marginalia.tools.experiments;
|
package nu.marginalia.tools.experiments;
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import nu.marginalia.converting.model.ProcessedDocument;
|
|
||||||
import nu.marginalia.converting.processor.DomainProcessor;
|
|
||||||
import nu.marginalia.io.SerializableCrawlDataStream;
|
import nu.marginalia.io.SerializableCrawlDataStream;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
|
||||||
import nu.marginalia.tools.Experiment;
|
import nu.marginalia.tools.Experiment;
|
||||||
|
|
||||||
import java.util.Comparator;
|
|
||||||
|
|
||||||
public class SiteStatisticsExperiment extends Experiment {
|
public class SiteStatisticsExperiment extends Experiment {
|
||||||
|
|
||||||
|
|
||||||
private final DomainProcessor domainProcessor;
|
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public SiteStatisticsExperiment(DomainProcessor domainProcessor) {
|
public SiteStatisticsExperiment() {
|
||||||
this.domainProcessor = domainProcessor;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean process(SerializableCrawlDataStream stream) {
|
public boolean process(SerializableCrawlDataStream stream) {
|
||||||
var ret = domainProcessor.fullProcessing(stream);
|
|
||||||
|
|
||||||
ret.documents.stream()
|
|
||||||
.filter(ProcessedDocument::isProcessedFully)
|
|
||||||
.sorted(Comparator.comparing(doc -> doc.details.metadata.topology()))
|
|
||||||
.flatMap(doc -> doc.details.feedLinks.stream())
|
|
||||||
.map(EdgeUrl::toString)
|
|
||||||
.min(Comparator.comparing(String::length))
|
|
||||||
.ifPresent(url -> {
|
|
||||||
System.out.printf("\"%s\",\"%s\"\n", ret.domain, url);
|
|
||||||
});
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -1,6 +1,7 @@
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import subprocess, os
|
import subprocess, os
|
||||||
from typing import List, Set, Dict, Optional
|
from typing import List, Set, Dict, Optional
|
||||||
|
import argparse
|
||||||
|
|
||||||
build_dir = "/app/search.marginalia.nu/build"
|
build_dir = "/app/search.marginalia.nu/build"
|
||||||
docker_dir = "/app/search.marginalia.nu/docker"
|
docker_dir = "/app/search.marginalia.nu/docker"
|
||||||
@@ -12,11 +13,12 @@ class ServiceConfig:
|
|||||||
docker_name: str
|
docker_name: str
|
||||||
instances: int | None
|
instances: int | None
|
||||||
deploy_tier: int
|
deploy_tier: int
|
||||||
|
groups: Set[str]
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class DeploymentPlan:
|
class DeploymentPlan:
|
||||||
services_to_build: List[str]
|
services_to_build: List[str]
|
||||||
instances_to_hold: Set[str]
|
instances_to_deploy: Set[str]
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class DockerContainer:
|
class DockerContainer:
|
||||||
@@ -72,24 +74,49 @@ def parse_deployment_tags(
|
|||||||
instances_to_hold = set()
|
instances_to_hold = set()
|
||||||
|
|
||||||
available_services = set(service_config.keys())
|
available_services = set(service_config.keys())
|
||||||
|
available_groups = set()
|
||||||
|
|
||||||
|
partitions = set()
|
||||||
|
|
||||||
|
for service in service_config.values():
|
||||||
|
available_groups = available_groups | service.groups
|
||||||
|
|
||||||
for tag in [tag.strip() for tag in tag_messages]:
|
for tag in [tag.strip() for tag in tag_messages]:
|
||||||
|
if tag.startswith('partition:'):
|
||||||
|
for p in tag[10:].strip().split(','):
|
||||||
|
partitions.add(int(p))
|
||||||
if tag.startswith('deploy:'):
|
if tag.startswith('deploy:'):
|
||||||
parts = tag[7:].strip().split(',')
|
parts = tag[7:].strip().split(',')
|
||||||
|
|
||||||
for part in parts:
|
for part in parts:
|
||||||
part = part.strip()
|
part = part.strip()
|
||||||
if part == 'all':
|
|
||||||
services_to_build.update(available_services)
|
if part.startswith('-'):
|
||||||
elif part.startswith('-'):
|
service = part[1:]
|
||||||
services_to_exclude.add(part[1:])
|
if not service in available_services:
|
||||||
|
raise ValueError(f"Unknown service {service}")
|
||||||
|
|
||||||
|
services_to_exclude.add(service)
|
||||||
elif part.startswith('+'):
|
elif part.startswith('+'):
|
||||||
services_to_build.add(part[1:])
|
service = part[1:]
|
||||||
|
if not service in available_services:
|
||||||
|
raise ValueError(f"Unknown service {service}")
|
||||||
|
|
||||||
|
services_to_build.add(service)
|
||||||
|
else:
|
||||||
|
group = part
|
||||||
|
if not group in available_groups:
|
||||||
|
raise ValueError(f"Unknown service group {group}")
|
||||||
|
for name, service in service_config.items():
|
||||||
|
if group in service.groups:
|
||||||
|
services_to_build.add(name)
|
||||||
|
|
||||||
elif tag.startswith('hold:'):
|
elif tag.startswith('hold:'):
|
||||||
instances = tag[5:].strip().split(',')
|
instances = tag[5:].strip().split(',')
|
||||||
instances_to_hold.update(i.strip() for i in instances if i.strip())
|
instances_to_hold.update(i.strip() for i in instances if i.strip())
|
||||||
|
|
||||||
|
print(partitions)
|
||||||
|
|
||||||
# Remove any explicitly excluded services
|
# Remove any explicitly excluded services
|
||||||
services_to_build = services_to_build - services_to_exclude
|
services_to_build = services_to_build - services_to_exclude
|
||||||
|
|
||||||
@@ -98,9 +125,32 @@ def parse_deployment_tags(
     if invalid_services:
         raise ValueError(f"Unknown services specified: {invalid_services}")
 
+    to_deploy = list()
+    for service in services_to_build:
+        config = service_config[service]
+
+        if config.instances == None:
+            if config.docker_name in instances_to_hold:
+                continue
+            container = DockerContainer(config.docker_name, 0, config)
+
+            if len(partitions) == 0 or 0 in partitions:
+                to_deploy.append(container)
+        else:
+            for instance in range(1,config.instances + 1):
+                if config.docker_name in instances_to_hold:
+                    continue
+
+                container_name = f"{config.docker_name}-{instance}"
+                if container_name in instances_to_hold:
+                    continue
+
+                if len(partitions) == 0 or instance in partitions:
+                    to_deploy.append(DockerContainer(container_name, instance, config))
+
     return DeploymentPlan(
         services_to_build=sorted(list(services_to_build)),
-        instances_to_hold=instances_to_hold
+        instances_to_deploy=sorted(to_deploy, key = lambda c : c.deploy_key())
     )
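The `DockerContainer(...)` constructor and the `deploy_key()` used to order the plan are referenced here but defined outside the hunks shown. A minimal sketch of what they plausibly look like, assuming the key simply orders the rollout by `deploy_tier` and then by instance number, is:

```python
from dataclasses import dataclass

@dataclass
class DockerContainer:
    name: str           # e.g. "index-service-3", or the bare docker_name for instance 0
    instance: int       # 0 for single-instance services, 1..N otherwise
    config: "ServiceConfig"

    def deploy_key(self):
        # Assumption: deploy lower tiers first, then walk instances within a tier.
        return (self.config.deploy_tier, self.instance)
```

With a key like that, `sorted(to_deploy, ...)` would roll out control (tier 0) before api (tier 1), then search/query (tier 2), and index/executor (tier 3) last, which matches the tier numbers assigned in the service configuration further down.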
@@ -118,6 +168,7 @@ def deploy_container(container: DockerContainer) -> None:
         text=True
     )
 
+
     # Stream output in real-time
     while True:
         output = process.stdout.readline()
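The body of this streaming loop falls outside the hunk. For reference, the usual completion of the readline pattern that `deploy_container` appears to use looks roughly like the standalone sketch below; the command is an arbitrary stand-in for whatever the script actually runs.

```python
import subprocess

# Standalone illustration of the readline-streaming pattern; the command is arbitrary.
process = subprocess.Popen(
    ['echo', 'hello'],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True
)

# Stream output in real-time
while True:
    output = process.stdout.readline()
    if output == '' and process.poll() is not None:
        break
    if output:
        print(output, end='')

print("exit code:", process.poll())
```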
@@ -131,49 +182,27 @@ def deploy_container(container: DockerContainer) -> None:
         raise BuildError(container, return_code)
 
 def deploy_services(containers: List[str]) -> None:
-    cwd = os.getcwd()
+    print(f"Deploying {containers}")
     os.chdir(docker_dir)
 
     for container in containers:
         deploy_container(container)
 
 def build_and_deploy(plan: DeploymentPlan, service_config: Dict[str, ServiceConfig]):
     """Execute the deployment plan"""
-    for service in plan.services_to_build:
-        config = service_config[service]
-        print(f"Building {service}:")
-        run_gradle_build(service, config.gradle_target)
+    run_gradle_build([service_config[service].gradle_target for service in plan.services_to_build])
 
-    to_deploy = list()
-    for service in plan.services_to_build:
-        config = service_config[service]
-
-        if config.instances == None:
-            if config.docker_name in plan.instances_to_hold:
-                continue
-            container = DockerContainer(config.docker_name, 0, config)
-
-            to_deploy.append(container)
-        else:
-            for instance in range(1,config.instances + 1):
-                container_name = f"{config.docker_name}-{instance}"
-                if container_name in plan.instances_to_hold:
-                    continue
-                to_deploy.append(DockerContainer(container_name, instance, config))
-    to_deploy = sorted(to_deploy, key = lambda c : c.deploy_key())
-
-    deploy_services(to_deploy)
+    deploy_services(plan.instances_to_deploy)
 
-
-def run_gradle_build(service: str, target: str) -> None:
+def run_gradle_build(targets: str) -> None:
     """
-    Run a Gradle build for the specified service and target.
+    Run a Gradle build for the specified target.
     Raises BuildError if the build fails.
     """
-    print(f"\nBuilding {service} with target {target}")
+    print(f"\nBuilding targets {targets}")
     process = subprocess.Popen(
-        ['./gradlew', '-q', target],
+        ['./gradlew', '-q'] + targets,
         stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT,
         text=True
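The practical effect of this hunk is that `build_and_deploy` now issues one batched Gradle invocation for the whole plan instead of one per service (note the annotation still reads `targets: str` even though a list is passed). Roughly, with a hypothetical two-service plan and target strings taken from the configuration below:

```python
# Effect of the change, with a hypothetical two-service plan:
gradle_targets = {
    'index': ':code:services-core:index-service:docker',
    'query': ':code:services-core:query-service:docker',
}
services_to_build = ['index', 'query']

# Old behaviour: len(services_to_build) separate Gradle invocations.
# New behaviour: a single command line, assembled the way run_gradle_build now does:
cmd = ['./gradlew', '-q'] + [gradle_targets[s] for s in services_to_build]
print(' '.join(cmd))
```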
@@ -199,73 +228,98 @@ if __name__ == '__main__':
             gradle_target=':code:services-application:search-service:docker',
             docker_name='search-service',
             instances=2,
-            deploy_tier=2
+            deploy_tier=2,
+            groups={"all", "frontend", "core"}
         ),
         'api': ServiceConfig(
             gradle_target=':code:services-application:api-service:docker',
             docker_name='api-service',
             instances=2,
-            deploy_tier=1
+            deploy_tier=1,
+            groups={"all", "core"}
         ),
-        'api': ServiceConfig(
+        'assistant': ServiceConfig(
             gradle_target=':code:services-core:assistant-service:docker',
             docker_name='assistant-service',
             instances=2,
-            deploy_tier=2
+            deploy_tier=2,
+            groups={"all", "core"}
         ),
         'explorer': ServiceConfig(
             gradle_target=':code:services-application:explorer-service:docker',
             docker_name='explorer-service',
-            instances=1,
-            deploy_tier=1
+            instances=None,
+            deploy_tier=1,
+            groups={"all", "extra"}
         ),
         'dating': ServiceConfig(
             gradle_target=':code:services-application:dating-service:docker',
             docker_name='dating-service',
-            instances=1,
-            deploy_tier=1
+            instances=None,
+            deploy_tier=1,
+            groups={"all", "extra"}
         ),
         'index': ServiceConfig(
            gradle_target=':code:services-core:index-service:docker',
            docker_name='index-service',
            instances=10,
-           deploy_tier=3
+           deploy_tier=3,
+           groups={"all", "index"}
         ),
         'executor': ServiceConfig(
            gradle_target=':code:services-core:executor-service:docker',
            docker_name='executor-service',
            instances=10,
-           deploy_tier=3
+           deploy_tier=3,
+           groups={"all", "executor"}
         ),
         'control': ServiceConfig(
            gradle_target=':code:services-core:control-service:docker',
            docker_name='control-service',
            instances=None,
-           deploy_tier=0
+           deploy_tier=0,
+           groups={"all", "core"}
         ),
         'query': ServiceConfig(
            gradle_target=':code:services-core:query-service:docker',
            docker_name='query-service',
            instances=2,
-           deploy_tier=2
+           deploy_tier=2,
+           groups={"all", "query"}
         ),
     }
 
     try:
-        tags = get_deployment_tag()
-        if tags == None:
-            exit
+        parser = argparse.ArgumentParser(
+            prog='deployment.py',
+            description='Continuous Deployment helper')
+        parser.add_argument('-v', '--verify', help='Verify the tags are valid, if present', action='store_true')
+        parser.add_argument('-t', '--tag', help='Use the specified tag value instead of the head git tag starting with deploy-')
 
-        print(tags)
+        args = parser.parse_args()
+        tags = args.tag
+        if tags is None:
+            tags = get_deployment_tag()
+        else:
+            tags = tags.split(' ')
 
-        plan = parse_deployment_tags(tags, SERVICE_CONFIG)
-        print("\nDeployment Plan:")
-        print("Services to build:", plan.services_to_build)
-        print("Instances to hold:", plan.instances_to_hold)
+        if tags != None:
+            print("Found deployment tags:", tags)
 
-        print("\nExecution Plan:")
+            plan = parse_deployment_tags(tags, SERVICE_CONFIG)
 
-        build_and_deploy(plan, SERVICE_CONFIG)
+            print("\nDeployment Plan:")
+            print("Services to build:", plan.services_to_build)
+            print("Instances to deploy:", [container.name for container in plan.instances_to_deploy])
+
+            if not args.verify:
+                print("\nExecution Plan:")
+
+                build_and_deploy(plan, SERVICE_CONFIG)
+        else:
+            print("No tags found")
+
     except ValueError as e:
         print(f"Error: {e}")
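The new command-line surface can be exercised in isolation. Below is a self-contained sketch of the argparse setup added above (argument names and help strings copied from the diff; the sample tag value is invented), showing how a `-t` value is split into individual space-separated tags before being handed to `parse_deployment_tags`:

```python
import argparse

parser = argparse.ArgumentParser(
    prog='deployment.py',
    description='Continuous Deployment helper')
parser.add_argument('-v', '--verify', help='Verify the tags are valid, if present', action='store_true')
parser.add_argument('-t', '--tag', help='Use the specified tag value instead of the head git tag starting with deploy-')

# Simulate: deployment.py --verify --tag "deploy:core,-api partition:1"
args = parser.parse_args(['--verify', '--tag', 'deploy:core,-api partition:1'])
tags = args.tag.split(' ')   # -> ['deploy:core,-api', 'partition:1']
print(args.verify, tags)
```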