mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 07:32:38 +02:00
Compare commits
6 Commits
deploy-008
...
deploy-009
Author | SHA1 | Date | |
---|---|---|---|
|
3bb84eb206 | ||
|
be7d13ccce | ||
|
8c088a7c0b | ||
|
ea9a642b9b | ||
|
27f528af6a | ||
|
20ca41ec95 |
@@ -13,7 +13,7 @@ import java.net.InetSocketAddress;
|
||||
|
||||
public class MetricsServer {
|
||||
|
||||
private static Logger logger = LoggerFactory.getLogger(MetricsServer.class);
|
||||
private static final Logger logger = LoggerFactory.getLogger(MetricsServer.class);
|
||||
|
||||
@Inject
|
||||
public MetricsServer(ServiceConfiguration configuration) {
|
||||
@@ -30,6 +30,8 @@ public class MetricsServer {
|
||||
|
||||
context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
|
||||
|
||||
logger.info("MetricsServer listening on {}:{}", configuration.bindAddress(), configuration.metricsPort());
|
||||
|
||||
server.start();
|
||||
}
|
||||
catch (Exception|NoSuchMethodError ex) {
|
||||
|
@@ -11,7 +11,6 @@ import nu.marginalia.slop.column.primitive.IntColumn;
|
||||
import nu.marginalia.slop.column.primitive.LongColumn;
|
||||
import nu.marginalia.slop.column.string.EnumColumn;
|
||||
import nu.marginalia.slop.column.string.StringColumn;
|
||||
import nu.marginalia.slop.column.string.TxtStringColumn;
|
||||
import nu.marginalia.slop.desc.StorageType;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
|
||||
@@ -182,8 +181,8 @@ public record SlopDocumentRecord(
|
||||
}
|
||||
|
||||
// Basic information
|
||||
private static final TxtStringColumn domainsColumn = new TxtStringColumn("domain", StandardCharsets.UTF_8, StorageType.GZIP);
|
||||
private static final TxtStringColumn urlsColumn = new TxtStringColumn("url", StandardCharsets.UTF_8, StorageType.GZIP);
|
||||
private static final StringColumn domainsColumn = new StringColumn("domain", StandardCharsets.UTF_8, StorageType.GZIP);
|
||||
private static final StringColumn urlsColumn = new StringColumn("url", StandardCharsets.UTF_8, StorageType.GZIP);
|
||||
private static final VarintColumn ordinalsColumn = new VarintColumn("ordinal", StorageType.PLAIN);
|
||||
private static final EnumColumn statesColumn = new EnumColumn("state", StandardCharsets.US_ASCII, StorageType.PLAIN);
|
||||
private static final StringColumn stateReasonsColumn = new StringColumn("stateReason", StandardCharsets.US_ASCII, StorageType.GZIP);
|
||||
@@ -211,7 +210,7 @@ public record SlopDocumentRecord(
|
||||
private static final VarintCodedSequenceArrayColumn spansColumn = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD);
|
||||
|
||||
public static class KeywordsProjectionReader extends SlopTable {
|
||||
private final TxtStringColumn.Reader domainsReader;
|
||||
private final StringColumn.Reader domainsReader;
|
||||
private final VarintColumn.Reader ordinalsReader;
|
||||
private final IntColumn.Reader htmlFeaturesReader;
|
||||
private final LongColumn.Reader domainMetadataReader;
|
||||
@@ -275,8 +274,8 @@ public record SlopDocumentRecord(
|
||||
}
|
||||
|
||||
public static class MetadataReader extends SlopTable {
|
||||
private final TxtStringColumn.Reader domainsReader;
|
||||
private final TxtStringColumn.Reader urlsReader;
|
||||
private final StringColumn.Reader domainsReader;
|
||||
private final StringColumn.Reader urlsReader;
|
||||
private final VarintColumn.Reader ordinalsReader;
|
||||
private final StringColumn.Reader titlesReader;
|
||||
private final StringColumn.Reader descriptionsReader;
|
||||
@@ -332,8 +331,8 @@ public record SlopDocumentRecord(
|
||||
}
|
||||
|
||||
public static class Writer extends SlopTable {
|
||||
private final TxtStringColumn.Writer domainsWriter;
|
||||
private final TxtStringColumn.Writer urlsWriter;
|
||||
private final StringColumn.Writer domainsWriter;
|
||||
private final StringColumn.Writer urlsWriter;
|
||||
private final VarintColumn.Writer ordinalsWriter;
|
||||
private final EnumColumn.Writer statesWriter;
|
||||
private final StringColumn.Writer stateReasonsWriter;
|
||||
|
@@ -41,10 +41,7 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.security.Security;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
@@ -248,22 +245,47 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
// (this happens when the process is restarted after a crash or a shutdown)
|
||||
tasksDone.set(workLog.countFinishedJobs());
|
||||
|
||||
// List of deferred tasks used to ensure beneficial scheduling of domains with regard to DomainLocks,
|
||||
// merely shuffling the domains tends to lead to a lot of threads being blocked waiting for a semphore,
|
||||
// this will more aggressively attempt to schedule the jobs to avoid blocking
|
||||
List<CrawlTask> deferredTasks = new LinkedList<>();
|
||||
|
||||
// Create crawl tasks and submit them to the pool for execution
|
||||
for (CrawlSpecRecord crawlSpec : crawlSpecRecords) {
|
||||
if (workLog.isJobFinished(crawlSpec.domain()))
|
||||
continue;
|
||||
|
||||
var task = new CrawlTask(
|
||||
// Add to the end of the deferral list
|
||||
deferredTasks.addLast(new CrawlTask(
|
||||
crawlSpec,
|
||||
anchorTagsSource,
|
||||
outputDir,
|
||||
warcArchiver,
|
||||
domainStateDb,
|
||||
workLog);
|
||||
workLog));
|
||||
|
||||
if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) == null) {
|
||||
pool.submitQuietly(task);
|
||||
}
|
||||
// Start every task we currently can from the deferral list
|
||||
deferredTasks.removeIf(task -> {
|
||||
if (task.canRun()) {
|
||||
if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) != null) {
|
||||
return true; // task has already run, duplicate in crawl specs
|
||||
}
|
||||
|
||||
// This blocks the caller when the pool is full
|
||||
pool.submitQuietly(task);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
});
|
||||
}
|
||||
|
||||
// Schedule any lingering tasks for immediate execution
|
||||
for (var task : deferredTasks) {
|
||||
if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null)
|
||||
continue;
|
||||
|
||||
pool.submitQuietly(task);
|
||||
}
|
||||
|
||||
logger.info("Shutting down the pool, waiting for tasks to complete...");
|
||||
@@ -346,6 +368,12 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
this.id = Integer.toHexString(domain.hashCode());
|
||||
}
|
||||
|
||||
/** Best effort indicator whether we could start this now without getting stuck in
|
||||
* DomainLocks purgatory */
|
||||
public boolean canRun() {
|
||||
return domainLocks.canLock(new EdgeDomain(domain));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run() throws Exception {
|
||||
|
||||
|
@@ -251,6 +251,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
return new SitemapRetriever();
|
||||
}
|
||||
|
||||
/** Recursively fetch sitemaps */
|
||||
@Override
|
||||
public List<EdgeUrl> fetchSitemapUrls(String root, CrawlDelayTimer delayTimer) {
|
||||
try {
|
||||
@@ -270,7 +271,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
while (!sitemapQueue.isEmpty() && ret.size() < 20_000 && ++fetchedSitemaps < 10) {
|
||||
var head = sitemapQueue.removeFirst();
|
||||
|
||||
switch (fetchSitemap(head)) {
|
||||
switch (fetchSingleSitemap(head)) {
|
||||
case SitemapResult.SitemapUrls(List<String> urls) -> {
|
||||
|
||||
for (var url : urls) {
|
||||
@@ -306,7 +307,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
}
|
||||
|
||||
|
||||
private SitemapResult fetchSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
|
||||
private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
|
||||
HttpRequest getRequest = HttpRequest.newBuilder()
|
||||
.GET()
|
||||
.uri(sitemapUrl.asURI())
|
||||
|
@@ -44,6 +44,14 @@ public class DomainLocks {
|
||||
return new Semaphore(2);
|
||||
}
|
||||
|
||||
public boolean canLock(EdgeDomain domain) {
|
||||
Semaphore sem = locks.get(domain.topDomain.toLowerCase());
|
||||
if (null == sem)
|
||||
return true;
|
||||
else
|
||||
return sem.availablePermits() > 0;
|
||||
}
|
||||
|
||||
public static class DomainLock implements AutoCloseable {
|
||||
private final String domainName;
|
||||
private final Semaphore semaphore;
|
||||
|
@@ -7,8 +7,7 @@ import java.util.Arrays;
|
||||
|
||||
public enum SearchJsParameter {
|
||||
DEFAULT("default"),
|
||||
DENY_JS("no-js", "js:true"),
|
||||
REQUIRE_JS("yes-js", "js:false");
|
||||
DENY_JS("no-js", "special:scripts");
|
||||
|
||||
public final String value;
|
||||
public final String[] implictExcludeSearchTerms;
|
||||
@@ -20,7 +19,6 @@ public enum SearchJsParameter {
|
||||
|
||||
public static SearchJsParameter parse(@Nullable String value) {
|
||||
if (DENY_JS.value.equals(value)) return DENY_JS;
|
||||
if (REQUIRE_JS.value.equals(value)) return REQUIRE_JS;
|
||||
|
||||
return DEFAULT;
|
||||
}
|
||||
|
@@ -7,9 +7,7 @@ import java.util.Arrays;
|
||||
|
||||
public enum SearchJsParameter {
|
||||
DEFAULT("default"),
|
||||
DENY_JS("no-js", "js:true"),
|
||||
REQUIRE_JS("yes-js", "js:false");
|
||||
|
||||
DENY_JS("no-js", "special:scripts");
|
||||
public final String value;
|
||||
public final String[] implictExcludeSearchTerms;
|
||||
|
||||
@@ -20,7 +18,6 @@ public enum SearchJsParameter {
|
||||
|
||||
public static SearchJsParameter parse(@Nullable String value) {
|
||||
if (DENY_JS.value.equals(value)) return DENY_JS;
|
||||
if (REQUIRE_JS.value.equals(value)) return REQUIRE_JS;
|
||||
|
||||
return DEFAULT;
|
||||
}
|
||||
|
Reference in New Issue
Block a user