mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-10-06 17:32:39 +02:00
Compare commits
5 Commits
deploy-008
...
deploy-009
Author | SHA1 | Date | |
---|---|---|---|
|
be7d13ccce | ||
|
8c088a7c0b | ||
|
ea9a642b9b | ||
|
27f528af6a | ||
|
20ca41ec95 |
@@ -11,7 +11,6 @@ import nu.marginalia.slop.column.primitive.IntColumn;
|
|||||||
import nu.marginalia.slop.column.primitive.LongColumn;
|
import nu.marginalia.slop.column.primitive.LongColumn;
|
||||||
import nu.marginalia.slop.column.string.EnumColumn;
|
import nu.marginalia.slop.column.string.EnumColumn;
|
||||||
import nu.marginalia.slop.column.string.StringColumn;
|
import nu.marginalia.slop.column.string.StringColumn;
|
||||||
import nu.marginalia.slop.column.string.TxtStringColumn;
|
|
||||||
import nu.marginalia.slop.desc.StorageType;
|
import nu.marginalia.slop.desc.StorageType;
|
||||||
import org.jetbrains.annotations.Nullable;
|
import org.jetbrains.annotations.Nullable;
|
||||||
|
|
||||||
@@ -182,8 +181,8 @@ public record SlopDocumentRecord(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Basic information
|
// Basic information
|
||||||
private static final TxtStringColumn domainsColumn = new TxtStringColumn("domain", StandardCharsets.UTF_8, StorageType.GZIP);
|
private static final StringColumn domainsColumn = new StringColumn("domain", StandardCharsets.UTF_8, StorageType.GZIP);
|
||||||
private static final TxtStringColumn urlsColumn = new TxtStringColumn("url", StandardCharsets.UTF_8, StorageType.GZIP);
|
private static final StringColumn urlsColumn = new StringColumn("url", StandardCharsets.UTF_8, StorageType.GZIP);
|
||||||
private static final VarintColumn ordinalsColumn = new VarintColumn("ordinal", StorageType.PLAIN);
|
private static final VarintColumn ordinalsColumn = new VarintColumn("ordinal", StorageType.PLAIN);
|
||||||
private static final EnumColumn statesColumn = new EnumColumn("state", StandardCharsets.US_ASCII, StorageType.PLAIN);
|
private static final EnumColumn statesColumn = new EnumColumn("state", StandardCharsets.US_ASCII, StorageType.PLAIN);
|
||||||
private static final StringColumn stateReasonsColumn = new StringColumn("stateReason", StandardCharsets.US_ASCII, StorageType.GZIP);
|
private static final StringColumn stateReasonsColumn = new StringColumn("stateReason", StandardCharsets.US_ASCII, StorageType.GZIP);
|
||||||
@@ -211,7 +210,7 @@ public record SlopDocumentRecord(
|
|||||||
private static final VarintCodedSequenceArrayColumn spansColumn = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD);
|
private static final VarintCodedSequenceArrayColumn spansColumn = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD);
|
||||||
|
|
||||||
public static class KeywordsProjectionReader extends SlopTable {
|
public static class KeywordsProjectionReader extends SlopTable {
|
||||||
private final TxtStringColumn.Reader domainsReader;
|
private final StringColumn.Reader domainsReader;
|
||||||
private final VarintColumn.Reader ordinalsReader;
|
private final VarintColumn.Reader ordinalsReader;
|
||||||
private final IntColumn.Reader htmlFeaturesReader;
|
private final IntColumn.Reader htmlFeaturesReader;
|
||||||
private final LongColumn.Reader domainMetadataReader;
|
private final LongColumn.Reader domainMetadataReader;
|
||||||
@@ -275,8 +274,8 @@ public record SlopDocumentRecord(
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static class MetadataReader extends SlopTable {
|
public static class MetadataReader extends SlopTable {
|
||||||
private final TxtStringColumn.Reader domainsReader;
|
private final StringColumn.Reader domainsReader;
|
||||||
private final TxtStringColumn.Reader urlsReader;
|
private final StringColumn.Reader urlsReader;
|
||||||
private final VarintColumn.Reader ordinalsReader;
|
private final VarintColumn.Reader ordinalsReader;
|
||||||
private final StringColumn.Reader titlesReader;
|
private final StringColumn.Reader titlesReader;
|
||||||
private final StringColumn.Reader descriptionsReader;
|
private final StringColumn.Reader descriptionsReader;
|
||||||
@@ -332,8 +331,8 @@ public record SlopDocumentRecord(
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static class Writer extends SlopTable {
|
public static class Writer extends SlopTable {
|
||||||
private final TxtStringColumn.Writer domainsWriter;
|
private final StringColumn.Writer domainsWriter;
|
||||||
private final TxtStringColumn.Writer urlsWriter;
|
private final StringColumn.Writer urlsWriter;
|
||||||
private final VarintColumn.Writer ordinalsWriter;
|
private final VarintColumn.Writer ordinalsWriter;
|
||||||
private final EnumColumn.Writer statesWriter;
|
private final EnumColumn.Writer statesWriter;
|
||||||
private final StringColumn.Writer stateReasonsWriter;
|
private final StringColumn.Writer stateReasonsWriter;
|
||||||
|
@@ -41,10 +41,7 @@ import java.nio.file.Files;
|
|||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.nio.file.StandardCopyOption;
|
import java.nio.file.StandardCopyOption;
|
||||||
import java.security.Security;
|
import java.security.Security;
|
||||||
import java.util.ArrayList;
|
import java.util.*;
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
@@ -248,22 +245,47 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
// (this happens when the process is restarted after a crash or a shutdown)
|
// (this happens when the process is restarted after a crash or a shutdown)
|
||||||
tasksDone.set(workLog.countFinishedJobs());
|
tasksDone.set(workLog.countFinishedJobs());
|
||||||
|
|
||||||
|
// List of deferred tasks used to ensure beneficial scheduling of domains with regard to DomainLocks,
|
||||||
|
// merely shuffling the domains tends to lead to a lot of threads being blocked waiting for a semaphore,
|
||||||
|
// this will more aggressively attempt to schedule the jobs to avoid blocking
|
||||||
|
List<CrawlTask> deferredTasks = new LinkedList<>();
|
||||||
|
|
||||||
// Create crawl tasks and submit them to the pool for execution
|
// Create crawl tasks and submit them to the pool for execution
|
||||||
for (CrawlSpecRecord crawlSpec : crawlSpecRecords) {
|
for (CrawlSpecRecord crawlSpec : crawlSpecRecords) {
|
||||||
if (workLog.isJobFinished(crawlSpec.domain()))
|
if (workLog.isJobFinished(crawlSpec.domain()))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
var task = new CrawlTask(
|
// Add to the end of the deferral list
|
||||||
|
deferredTasks.addLast(new CrawlTask(
|
||||||
crawlSpec,
|
crawlSpec,
|
||||||
anchorTagsSource,
|
anchorTagsSource,
|
||||||
outputDir,
|
outputDir,
|
||||||
warcArchiver,
|
warcArchiver,
|
||||||
domainStateDb,
|
domainStateDb,
|
||||||
workLog);
|
workLog));
|
||||||
|
|
||||||
if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) == null) {
|
// Start every task we currently can from the deferral list
|
||||||
pool.submitQuietly(task);
|
deferredTasks.removeIf(task -> {
|
||||||
|
if (task.canRun()) {
|
||||||
|
if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) != null) {
|
||||||
|
return true; // task has already run, duplicate in crawl specs
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// This blocks the caller when the pool is full
|
||||||
|
pool.submitQuietly(task);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Schedule any lingering tasks for immediate execution
|
||||||
|
for (var task : deferredTasks) {
|
||||||
|
if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
pool.submitQuietly(task);
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info("Shutting down the pool, waiting for tasks to complete...");
|
logger.info("Shutting down the pool, waiting for tasks to complete...");
|
||||||
@@ -346,6 +368,12 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
this.id = Integer.toHexString(domain.hashCode());
|
this.id = Integer.toHexString(domain.hashCode());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Best effort indicator whether we could start this now without getting stuck in
|
||||||
|
* DomainLocks purgatory */
|
||||||
|
public boolean canRun() {
|
||||||
|
return domainLocks.canLock(new EdgeDomain(domain));
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void run() throws Exception {
|
public void run() throws Exception {
|
||||||
|
|
||||||
|
@@ -251,6 +251,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
return new SitemapRetriever();
|
return new SitemapRetriever();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Recursively fetch sitemaps */
|
||||||
@Override
|
@Override
|
||||||
public List<EdgeUrl> fetchSitemapUrls(String root, CrawlDelayTimer delayTimer) {
|
public List<EdgeUrl> fetchSitemapUrls(String root, CrawlDelayTimer delayTimer) {
|
||||||
try {
|
try {
|
||||||
@@ -270,7 +271,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
while (!sitemapQueue.isEmpty() && ret.size() < 20_000 && ++fetchedSitemaps < 10) {
|
while (!sitemapQueue.isEmpty() && ret.size() < 20_000 && ++fetchedSitemaps < 10) {
|
||||||
var head = sitemapQueue.removeFirst();
|
var head = sitemapQueue.removeFirst();
|
||||||
|
|
||||||
switch (fetchSitemap(head)) {
|
switch (fetchSingleSitemap(head)) {
|
||||||
case SitemapResult.SitemapUrls(List<String> urls) -> {
|
case SitemapResult.SitemapUrls(List<String> urls) -> {
|
||||||
|
|
||||||
for (var url : urls) {
|
for (var url : urls) {
|
||||||
@@ -306,7 +307,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private SitemapResult fetchSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
|
private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
|
||||||
HttpRequest getRequest = HttpRequest.newBuilder()
|
HttpRequest getRequest = HttpRequest.newBuilder()
|
||||||
.GET()
|
.GET()
|
||||||
.uri(sitemapUrl.asURI())
|
.uri(sitemapUrl.asURI())
|
||||||
|
@@ -44,6 +44,14 @@ public class DomainLocks {
|
|||||||
return new Semaphore(2);
|
return new Semaphore(2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean canLock(EdgeDomain domain) {
|
||||||
|
Semaphore sem = locks.get(domain.topDomain.toLowerCase());
|
||||||
|
if (null == sem)
|
||||||
|
return true;
|
||||||
|
else
|
||||||
|
return sem.availablePermits() > 0;
|
||||||
|
}
|
||||||
|
|
||||||
public static class DomainLock implements AutoCloseable {
|
public static class DomainLock implements AutoCloseable {
|
||||||
private final String domainName;
|
private final String domainName;
|
||||||
private final Semaphore semaphore;
|
private final Semaphore semaphore;
|
||||||
|
@@ -7,8 +7,7 @@ import java.util.Arrays;
|
|||||||
|
|
||||||
public enum SearchJsParameter {
|
public enum SearchJsParameter {
|
||||||
DEFAULT("default"),
|
DEFAULT("default"),
|
||||||
DENY_JS("no-js", "js:true"),
|
DENY_JS("no-js", "special:scripts");
|
||||||
REQUIRE_JS("yes-js", "js:false");
|
|
||||||
|
|
||||||
public final String value;
|
public final String value;
|
||||||
public final String[] implictExcludeSearchTerms;
|
public final String[] implictExcludeSearchTerms;
|
||||||
@@ -20,7 +19,6 @@ public enum SearchJsParameter {
|
|||||||
|
|
||||||
public static SearchJsParameter parse(@Nullable String value) {
|
public static SearchJsParameter parse(@Nullable String value) {
|
||||||
if (DENY_JS.value.equals(value)) return DENY_JS;
|
if (DENY_JS.value.equals(value)) return DENY_JS;
|
||||||
if (REQUIRE_JS.value.equals(value)) return REQUIRE_JS;
|
|
||||||
|
|
||||||
return DEFAULT;
|
return DEFAULT;
|
||||||
}
|
}
|
||||||
|
@@ -7,9 +7,7 @@ import java.util.Arrays;
|
|||||||
|
|
||||||
public enum SearchJsParameter {
|
public enum SearchJsParameter {
|
||||||
DEFAULT("default"),
|
DEFAULT("default"),
|
||||||
DENY_JS("no-js", "js:true"),
|
DENY_JS("no-js", "special:scripts");
|
||||||
REQUIRE_JS("yes-js", "js:false");
|
|
||||||
|
|
||||||
public final String value;
|
public final String value;
|
||||||
public final String[] implictExcludeSearchTerms;
|
public final String[] implictExcludeSearchTerms;
|
||||||
|
|
||||||
@@ -20,7 +18,6 @@ public enum SearchJsParameter {
|
|||||||
|
|
||||||
public static SearchJsParameter parse(@Nullable String value) {
|
public static SearchJsParameter parse(@Nullable String value) {
|
||||||
if (DENY_JS.value.equals(value)) return DENY_JS;
|
if (DENY_JS.value.equals(value)) return DENY_JS;
|
||||||
if (REQUIRE_JS.value.equals(value)) return REQUIRE_JS;
|
|
||||||
|
|
||||||
return DEFAULT;
|
return DEFAULT;
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user