Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-10-06 07:32:38 +02:00
Compare commits: deploy-009 ... deploy-009 (8 commits)
Commits:

- 185bf28fca
- 78cc25584a
- 62ba30bacf
- 3bb84eb206
- be7d13ccce
- 8c088a7c0b
- ea9a642b9b
- 27f528af6a
@@ -121,6 +121,7 @@ public class ServiceConfigurationModule extends AbstractModule {

         while (nets.hasMoreElements()) {
             NetworkInterface netif = nets.nextElement();
+            logger.info("Considering network interface {}: Up? {}, Loopback? {}", netif.getDisplayName(), netif.isUp(), netif.isLoopback());
             if (!netif.isUp() || netif.isLoopback()) {
                 continue;
             }
@@ -128,6 +129,7 @@ public class ServiceConfigurationModule extends AbstractModule {
             Enumeration<InetAddress> inetAddresses = netif.getInetAddresses();
             while (inetAddresses.hasMoreElements()) {
                 InetAddress addr = inetAddresses.nextElement();
+                logger.info("Considering address {}: SiteLocal? {}, Loopback? {}", addr.getHostAddress(), addr.isSiteLocalAddress(), addr.isLoopbackAddress());
                 if (addr.isSiteLocalAddress() && !addr.isLoopbackAddress()) {
                     return addr.getHostAddress();
                 }
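The two added log lines instrument the service's bind-address discovery. As a rough, self-contained sketch of the loop they sit in (the class name and the fallback value are illustrative assumptions, not the module's actual code):

import java.net.InetAddress;
import java.net.NetworkInterface;
import java.net.SocketException;
import java.util.Collections;

class SiteLocalAddressFinder {
    /** Returns the first site-local (RFC 1918) address found on an interface that is up and not a loopback. */
    static String findSiteLocalAddress() throws SocketException {
        for (NetworkInterface netif : Collections.list(NetworkInterface.getNetworkInterfaces())) {
            if (!netif.isUp() || netif.isLoopback())
                continue;

            for (InetAddress addr : Collections.list(netif.getInetAddresses())) {
                if (addr.isSiteLocalAddress() && !addr.isLoopbackAddress())
                    return addr.getHostAddress();
            }
        }
        return "127.0.0.1"; // fallback; the real module's fallback behavior is not shown in this diff
    }

    public static void main(String[] args) throws SocketException {
        System.out.println(findSiteLocalAddress());
    }
}

Logging each candidate interface and address makes it much easier to see why a service picked, or failed to pick, a usable site-local address.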
@@ -13,7 +13,7 @@ import java.net.InetSocketAddress;

 public class MetricsServer {

-    private static Logger logger = LoggerFactory.getLogger(MetricsServer.class);
+    private static final Logger logger = LoggerFactory.getLogger(MetricsServer.class);

     @Inject
     public MetricsServer(ServiceConfiguration configuration) {
@@ -30,6 +30,8 @@ public class MetricsServer {

             context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");

+            logger.info("MetricsServer listening on {}:{}", configuration.bindAddress(), configuration.metricsPort());
+
             server.start();
         }
         catch (Exception|NoSuchMethodError ex) {
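The logger becomes final and the server now announces where it is listening. A minimal sketch of this kind of Jetty-plus-Prometheus wiring, assuming the simpleclient MetricsServlet and Jetty's ServletContextHandler (the method signature and accessors are placeholders, not the class's real constructor):

import io.prometheus.client.exporter.MetricsServlet;
import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.servlet.ServletContextHandler;
import org.eclipse.jetty.servlet.ServletHolder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.net.InetSocketAddress;

class MetricsServerSketch {
    private static final Logger logger = LoggerFactory.getLogger(MetricsServerSketch.class);

    static void start(String bindAddress, int metricsPort) throws Exception {
        Server server = new Server(new InetSocketAddress(bindAddress, metricsPort));

        ServletContextHandler context = new ServletContextHandler();
        context.setContextPath("/");
        server.setHandler(context);

        // Expose Prometheus metrics on /metrics
        context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");

        // The commit adds a similar line so the effective bind address and port show up in the service log
        logger.info("MetricsServer listening on {}:{}", bindAddress, metricsPort);

        server.start();
    }
}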
@@ -41,10 +41,7 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
 import java.security.Security;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -248,22 +245,47 @@ public class CrawlerMain extends ProcessMainClass {
             // (this happens when the process is restarted after a crash or a shutdown)
             tasksDone.set(workLog.countFinishedJobs());

+            // List of deferred tasks used to ensure beneficial scheduling of domains with regard to DomainLocks,
+            // merely shuffling the domains tends to lead to a lot of threads being blocked waiting for a semaphore,
+            // this will more aggressively attempt to schedule the jobs to avoid blocking
+            List<CrawlTask> deferredTasks = new LinkedList<>();
+
             // Create crawl tasks and submit them to the pool for execution
             for (CrawlSpecRecord crawlSpec : crawlSpecRecords) {
                 if (workLog.isJobFinished(crawlSpec.domain()))
                     continue;

-                var task = new CrawlTask(
+                // Add to the end of the deferral list
+                deferredTasks.addLast(new CrawlTask(
                         crawlSpec,
                         anchorTagsSource,
                         outputDir,
                         warcArchiver,
                         domainStateDb,
-                        workLog);
+                        workLog));

-                if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) == null) {
-                    pool.submitQuietly(task);
-                }
+                // Start every task we currently can from the deferral list
+                deferredTasks.removeIf(task -> {
+                    if (task.canRun()) {
+                        if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) != null) {
+                            return true; // task has already run, duplicate in crawl specs
+                        }
+
+                        // This blocks the caller when the pool is full
+                        pool.submitQuietly(task);
+                        return true;
+                    }
+
+                    return false;
+                });
+            }
+
+            // Schedule any lingering tasks for immediate execution
+            for (var task : deferredTasks) {
+                if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null)
+                    continue;
+
+                pool.submitQuietly(task);
             }

             logger.info("Shutting down the pool, waiting for tasks to complete...");
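The interesting part of this hunk is the scheduling strategy: instead of submitting each CrawlTask directly, tasks are appended to a deferral list, and only those whose domain lock currently has free permits are submitted; whatever is still deferred at the end is flushed and may block. The next hunk adds the canRun() probe this relies on. A stand-alone sketch of the pattern, with simplified stand-ins for CrawlTask, DomainLocks, and the execution pool:

import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Semaphore;
import java.util.function.Consumer;

/** Sketch of the deferred-scheduling pattern introduced in CrawlerMain.
 *  Task, the lock map, and the submit callback are simplified stand-ins. */
class DeferredSchedulingSketch {

    record Task(String domain) {}

    static void schedule(Iterable<Task> incoming, Map<String, Semaphore> locks, Consumer<Task> submit) {
        LinkedList<Task> deferred = new LinkedList<>();

        for (Task task : incoming) {
            // Defer first, then drain whatever is currently runnable; this keeps
            // worker threads from piling up behind a single domain's semaphore
            deferred.addLast(task);

            deferred.removeIf(t -> {
                Semaphore lock = locks.get(t.domain());
                boolean canRun = (lock == null) || lock.availablePermits() > 0;
                if (canRun) {
                    submit.accept(t); // in the real code this may block when the pool is full
                    return true;
                }
                return false;
            });
        }

        // Anything still deferred is submitted at the end and may block on its domain lock
        deferred.forEach(submit);
    }

    public static void main(String[] args) {
        var locks = Map.of("busy.example", new Semaphore(0), "idle.example", new Semaphore(2));
        var tasks = List.of(new Task("busy.example"), new Task("idle.example"));
        schedule(tasks, locks, t -> System.out.println("submitted " + t.domain()));
    }
}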
@@ -346,6 +368,12 @@ public class CrawlerMain extends ProcessMainClass {
             this.id = Integer.toHexString(domain.hashCode());
         }

+        /** Best effort indicator whether we could start this now without getting stuck in
+         *  DomainLocks purgatory */
+        public boolean canRun() {
+            return domainLocks.canLock(new EdgeDomain(domain));
+        }
+
         @Override
         public void run() throws Exception {

|
|||||||
//
|
//
|
||||||
// This must be synchronized as chewing through parquet files in parallel leads to enormous memory overhead
|
// This must be synchronized as chewing through parquet files in parallel leads to enormous memory overhead
|
||||||
private synchronized Path migrateParquetData(Path inputPath, String domain, Path crawlDataRoot) throws IOException {
|
private synchronized Path migrateParquetData(Path inputPath, String domain, Path crawlDataRoot) throws IOException {
|
||||||
if (!inputPath.endsWith(".parquet")) {
|
if (!inputPath.toString().endsWith(".parquet")) {
|
||||||
return inputPath;
|
return inputPath;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
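The migrateParquetData change fixes a real bug rather than a style issue: java.nio.file.Path.endsWith compares trailing path elements, not string suffixes, so a bare extension never matches. A quick illustration:

import java.nio.file.Path;

class PathSuffixCheck {
    public static void main(String[] args) {
        Path p = Path.of("crawl-data/example.parquet");

        // Path.endsWith matches whole path elements, so an extension alone never matches:
        System.out.println(p.endsWith(".parquet"));            // false
        System.out.println(p.endsWith("example.parquet"));     // true (full file-name element)

        // The fixed check compares the textual form instead:
        System.out.println(p.toString().endsWith(".parquet")); // true
    }
}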
@@ -251,6 +251,7 @@ public class HttpFetcherImpl implements HttpFetcher {
         return new SitemapRetriever();
     }

+    /** Recursively fetch sitemaps */
     @Override
     public List<EdgeUrl> fetchSitemapUrls(String root, CrawlDelayTimer delayTimer) {
         try {
@@ -270,7 +271,7 @@ public class HttpFetcherImpl implements HttpFetcher {
             while (!sitemapQueue.isEmpty() && ret.size() < 20_000 && ++fetchedSitemaps < 10) {
                 var head = sitemapQueue.removeFirst();

-                switch (fetchSitemap(head)) {
+                switch (fetchSingleSitemap(head)) {
                     case SitemapResult.SitemapUrls(List<String> urls) -> {

                         for (var url : urls) {
@@ -306,7 +307,7 @@ public class HttpFetcherImpl implements HttpFetcher {
     }


-    private SitemapResult fetchSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
+    private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
         HttpRequest getRequest = HttpRequest.newBuilder()
                 .GET()
                 .uri(sitemapUrl.asURI())
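Renaming fetchSitemap to fetchSingleSitemap makes the recursion easier to follow: fetchSitemapUrls drains a queue of sitemap URLs, each single fetch yields either page URLs or references to further sitemaps, and the loop is capped on both the URL count and the number of sitemap fetches. A simplified sketch of that traversal shape (SitemapResult and the fetch function are stand-ins; the real code fetches over HTTP, honors crawl delays, and parses XML):

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Function;

/** Sketch of the bounded sitemap traversal in HttpFetcherImpl.fetchSitemapUrls. */
class SitemapTraversalSketch {

    sealed interface SitemapResult {
        record SitemapUrls(List<String> urls) implements SitemapResult {}
        record SitemapReferences(List<String> sitemaps) implements SitemapResult {}
        record Error() implements SitemapResult {}
    }

    static List<String> fetchSitemapUrls(String root, Function<String, SitemapResult> fetchSingleSitemap) {
        List<String> ret = new ArrayList<>();
        ArrayDeque<String> sitemapQueue = new ArrayDeque<>();
        sitemapQueue.add(root);

        int fetchedSitemaps = 0;

        // Same shape of bound as the real loop: cap both the URL count and the number of sitemap fetches
        while (!sitemapQueue.isEmpty() && ret.size() < 20_000 && ++fetchedSitemaps < 10) {
            String head = sitemapQueue.removeFirst();

            switch (fetchSingleSitemap.apply(head)) {
                case SitemapResult.SitemapUrls(List<String> urls) -> ret.addAll(urls);
                case SitemapResult.SitemapReferences(List<String> sitemaps) -> sitemapQueue.addAll(sitemaps);
                case SitemapResult.Error() -> { /* skip and continue with the rest of the queue */ }
            }
        }

        return ret;
    }
}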
@@ -44,6 +44,14 @@ public class DomainLocks {
         return new Semaphore(2);
     }

+    public boolean canLock(EdgeDomain domain) {
+        Semaphore sem = locks.get(domain.topDomain.toLowerCase());
+        if (null == sem)
+            return true;
+        else
+            return sem.availablePermits() > 0;
+    }
+
     public static class DomainLock implements AutoCloseable {
         private final String domainName;
         private final Semaphore semaphore;
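canLock deliberately reads availablePermits() without acquiring anything, which is why CrawlTask.canRun() is described as best effort: another thread can take the last permit right after the check. A compact sketch of the structure, with simplified permit sizing (the real class sizes semaphores per top domain):

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore;

/** Sketch of the DomainLocks pattern: one semaphore per top domain, plus the non-blocking canLock() probe. */
class DomainLocksSketch {
    private final Map<String, Semaphore> locks = new ConcurrentHashMap<>();

    /** Blocking acquisition, released via try-with-resources. */
    DomainLock lockDomain(String topDomain) throws InterruptedException {
        Semaphore sem = locks.computeIfAbsent(topDomain.toLowerCase(), k -> new Semaphore(2));
        sem.acquire();
        return new DomainLock(topDomain, sem);
    }

    /** Best-effort probe: true if an acquire would likely not block right now.
     *  Inherently racy -- another thread may grab the last permit after we looked. */
    boolean canLock(String topDomain) {
        Semaphore sem = locks.get(topDomain.toLowerCase());
        if (sem == null)
            return true;
        return sem.availablePermits() > 0;
    }

    record DomainLock(String domainName, Semaphore semaphore) implements AutoCloseable {
        @Override
        public void close() {
            semaphore.release();
        }
    }
}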
@@ -42,18 +42,20 @@ public interface SerializableCrawlDataStream extends AutoCloseable {
        {

            String fileName = fullPath.getFileName().toString();
-           if (fileName.endsWith(".parquet")) {
+           if (fileName.endsWith(".slop.zip")) {
                try {
-                   return new ParquetSerializableCrawlDataStream(fullPath);
+                   return new SlopSerializableCrawlDataStream(fullPath);
                } catch (Exception ex) {
                    logger.error("Error reading domain data from " + fullPath, ex);
                    return SerializableCrawlDataStream.empty();
                }
            }

-           if (fileName.endsWith(".slop.zip")) {
+           else if (fileName.endsWith(".parquet")) {
+               logger.error("Opening deprecated parquet-style crawl data stream", new Exception());
                try {
-                   return new SlopSerializableCrawlDataStream(fullPath);
+                   return new ParquetSerializableCrawlDataStream(fullPath);
                } catch (Exception ex) {
                    logger.error("Error reading domain data from " + fullPath, ex);
                    return SerializableCrawlDataStream.empty();
@@ -7,8 +7,7 @@ import java.util.Arrays;

 public enum SearchJsParameter {
     DEFAULT("default"),
-    DENY_JS("no-js", "js:true"),
-    REQUIRE_JS("yes-js", "js:false");
+    DENY_JS("no-js", "special:scripts");

     public final String value;
     public final String[] implictExcludeSearchTerms;
@@ -20,7 +19,6 @@ public enum SearchJsParameter {

     public static SearchJsParameter parse(@Nullable String value) {
         if (DENY_JS.value.equals(value)) return DENY_JS;
-        if (REQUIRE_JS.value.equals(value)) return REQUIRE_JS;

         return DEFAULT;
     }
@@ -7,9 +7,7 @@ import java.util.Arrays;

 public enum SearchJsParameter {
     DEFAULT("default"),
-    DENY_JS("no-js", "js:true"),
-    REQUIRE_JS("yes-js", "js:false");
+    DENY_JS("no-js", "special:scripts");

     public final String value;
     public final String[] implictExcludeSearchTerms;

@@ -20,7 +18,6 @@ public enum SearchJsParameter {

     public static SearchJsParameter parse(@Nullable String value) {
         if (DENY_JS.value.equals(value)) return DENY_JS;
-        if (REQUIRE_JS.value.equals(value)) return REQUIRE_JS;

         return DEFAULT;
     }
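Both copies of SearchJsParameter drop the REQUIRE_JS option and re-express the no-js filter through the special:scripts term. A self-contained sketch of how such an enum maps the UI parameter onto implicit exclude terms (the constructor and the surrounding usage are assumptions; field names mirror the diff):

import java.util.Arrays;
import java.util.List;

enum SearchJsParameterSketch {
    DEFAULT("default"),
    DENY_JS("no-js", "special:scripts");

    public final String value;
    public final String[] implictExcludeSearchTerms; // spelling kept as in the original source

    SearchJsParameterSketch(String value, String... implictExcludeSearchTerms) {
        this.value = value;
        this.implictExcludeSearchTerms = implictExcludeSearchTerms;
    }

    static SearchJsParameterSketch parse(String value) {
        if (DENY_JS.value.equals(value)) return DENY_JS;
        return DEFAULT;
    }

    public static void main(String[] args) {
        // "no-js" now implies excluding documents tagged with special:scripts
        List<String> excluded = Arrays.asList(parse("no-js").implictExcludeSearchTerms);
        System.out.println(excluded);
    }
}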