mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-06 07:32:38 +02:00

Compare commits

...

12 Commits

Author SHA1 Message Date
Viktor Lofgren
9f18ced73d (crawler) Improve deferred task behavior 2025-03-18 12:54:18 +01:00
Viktor Lofgren
18e91269ab (crawler) Improve deferred task behavior 2025-03-18 12:25:22 +01:00
Viktor Lofgren
e315ca5758 (search) Change icon for small web filter
The previous icon was of an irregular size and shifted the layout in an unaesthetic way.
2025-03-17 12:07:34 +01:00
Viktor Lofgren
3ceea17c1d (search) Adjustments to device detection in CSS
Use pointer:fine media query to better distinguish between mobile devices and PCs with a window in portrait orientation.

With this, we never show the mobile filtering functionality on desktop, and never show the touch-inaccessible minimized sidebar on mobile.
2025-03-17 12:04:34 +01:00
Viktor Lofgren
b34527c1a3 (search) Add small web filter for new UI 2025-03-17 11:39:19 +01:00
Viktor Lofgren
185bf28fca (crawler) Correct issue leading to parquet files not being correctly preconverted
Path.endsWith("str") != String.endsWith(".str")
2025-03-10 13:48:12 +01:00
Viktor Lofgren
78cc25584a (crawler) Add error logging when entering bad path for historical crawl data 2025-03-10 13:38:40 +01:00
Viktor Lofgren
62ba30bacf (common) Log info about metrics server 2025-03-10 13:12:39 +01:00
Viktor Lofgren
3bb84eb206 (common) Log info about metrics server 2025-03-10 13:03:48 +01:00
Viktor Lofgren
be7d13ccce (crawler) Correct task execution logic in crawler
The old behavior would flag domains as pending too soon, leading to them being omitted from execution if they were not immediately available to run.
2025-03-09 13:47:51 +01:00
Viktor Lofgren
8c088a7c0b (crawler) Remove custom thread factory
This was causing issues, and not really doing much of benefit.
2025-03-09 11:50:52 +01:00
Viktor Lofgren
ea9a642b9b (crawler) More effective task scheduling in the crawler
This should hopefully allow more threads to be busy
2025-03-09 11:44:59 +01:00
11 changed files with 96 additions and 15 deletions

View File

@@ -121,6 +121,7 @@ public class ServiceConfigurationModule extends AbstractModule {
         while (nets.hasMoreElements()) {
             NetworkInterface netif = nets.nextElement();
+            logger.info("Considering network interface {}: Up? {}, Loopback? {}", netif.getDisplayName(), netif.isUp(), netif.isLoopback());
             if (!netif.isUp() || netif.isLoopback()) {
                 continue;
             }
@@ -128,6 +129,7 @@ public class ServiceConfigurationModule extends AbstractModule {
             Enumeration<InetAddress> inetAddresses = netif.getInetAddresses();
             while (inetAddresses.hasMoreElements()) {
                 InetAddress addr = inetAddresses.nextElement();
+                logger.info("Considering address {}: SiteLocal? {}, Loopback? {}", addr.getHostAddress(), addr.isSiteLocalAddress(), addr.isLoopbackAddress());
                 if (addr.isSiteLocalAddress() && !addr.isLoopbackAddress()) {
                     return addr.getHostAddress();
                 }

View File

@@ -13,7 +13,7 @@ import java.net.InetSocketAddress;
 public class MetricsServer {
-    private static Logger logger = LoggerFactory.getLogger(MetricsServer.class);
+    private static final Logger logger = LoggerFactory.getLogger(MetricsServer.class);
     @Inject
     public MetricsServer(ServiceConfiguration configuration) {
@@ -30,6 +30,8 @@ public class MetricsServer {
             context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
+            logger.info("MetricsServer listening on {}:{}", configuration.bindAddress(), configuration.metricsPort());
             server.start();
         }
         catch (Exception|NoSuchMethodError ex) {

View File

@@ -248,9 +248,14 @@ public class CrawlerMain extends ProcessMainClass {
         // (this happens when the process is restarted after a crash or a shutdown)
         tasksDone.set(workLog.countFinishedJobs());

-        // Create crawl tasks and submit them to the pool for execution
+        // List of deferred tasks used to ensure beneficial scheduling of domains with regard to DomainLocks,
+        // merely shuffling the domains tends to lead to a lot of threads being blocked waiting for a semaphore,
+        // this will more aggressively attempt to schedule the jobs to avoid blocking
+        List<CrawlTask> taskList = new ArrayList<>();
+
+        // Create crawl tasks
         for (CrawlSpecRecord crawlSpec : crawlSpecRecords) {
-            if (workLog.isJobFinished(crawlSpec.domain()))
+            if (workLog.isJobFinished(crawlSpec.domain))
                 continue;

             var task = new CrawlTask(
@@ -261,11 +266,22 @@ public class CrawlerMain extends ProcessMainClass {
                     domainStateDb,
                     workLog);

-            if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) == null) {
-                pool.submitQuietly(task);
+            // Try to run immediately, to avoid unnecessarily keeping the entire work set in RAM
+            if (!trySubmitDeferredTask(task)) {
+                // Otherwise add to the taskList for deferred execution
+                taskList.add(task);
             }
         }

+        // Schedule viable tasks for execution until list is empty
+        while (!taskList.isEmpty()) {
+            taskList.removeIf(this::trySubmitDeferredTask);
+
+            // Add a small pause here to avoid busy looping toward the end of the execution cycle when
+            // we might have no new viable tasks to run for hours on end
+            TimeUnit.MILLISECONDS.sleep(50);
+        }

         logger.info("Shutting down the pool, waiting for tasks to complete...");
         pool.shutDown();
@@ -290,6 +306,28 @@ public class CrawlerMain extends ProcessMainClass {
             }
         }

+    /** Submit a task for execution if it can be run, returns true if it was submitted
+     *  or if it can be discarded */
+    private boolean trySubmitDeferredTask(CrawlTask task) {
+        if (!task.canRun()) {
+            return false;
+        }
+
+        if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null) {
+            return true; // task has already run, duplicate in crawl specs
+        }
+
+        try {
+            // This blocks the caller when the pool is full
+            pool.submitQuietly(task);
+            return true;
+        }
+        catch (RuntimeException ex) {
+            logger.error("Failed to submit task " + task.domain, ex);
+            return false;
+        }
+    }
+
     public void runForSingleDomain(String targetDomainName, FileStorageId fileStorageId) throws Exception {
         runForSingleDomain(targetDomainName, fileStorageService.getStorage(fileStorageId).asPath());
     }
@@ -346,9 +384,20 @@ public class CrawlerMain extends ProcessMainClass {
             this.id = Integer.toHexString(domain.hashCode());
         }

+        /** Best effort indicator whether we could start this now without getting stuck in
+         *  DomainLocks purgatory */
+        public boolean canRun() {
+            return domainLocks.canLock(new EdgeDomain(domain));
+        }
+
         @Override
         public void run() throws Exception {

+            if (workLog.isJobFinished(domain)) { // No-Op
+                logger.info("Omitting task {}, as it is already run", domain);
+                return;
+            }
+
             Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
             Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
             Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
@@ -403,7 +452,7 @@ public class CrawlerMain extends ProcessMainClass {
logger.error("Error fetching domain " + domain, e); logger.error("Error fetching domain " + domain, e);
} }
finally { finally {
// We don't need to double-count these; it's also kept int he workLog // We don't need to double-count these; it's also kept in the workLog
pendingCrawlTasks.remove(domain); pendingCrawlTasks.remove(domain);
Thread.currentThread().setName("[idle]"); Thread.currentThread().setName("[idle]");
@@ -494,7 +543,7 @@ public class CrawlerMain extends ProcessMainClass {
     //
     // This must be synchronized as chewing through parquet files in parallel leads to enormous memory overhead
     private synchronized Path migrateParquetData(Path inputPath, String domain, Path crawlDataRoot) throws IOException {
-        if (!inputPath.endsWith(".parquet")) {
+        if (!inputPath.toString().endsWith(".parquet")) {
             return inputPath;
         }
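The one-line fix above is subtle: java.nio.file.Path.endsWith compares trailing path segments, not string suffixes, so an extension check against a Path never matches. Converting to a String first restores the plain suffix test. A small demo of the difference:

import java.nio.file.Path;

class PathSuffixDemo {
    public static void main(String[] args) {
        Path p = Path.of("/crawl-data/example.parquet");

        // Path.endsWith matches whole trailing path elements, so ".parquet" (not an element) never matches
        System.out.println(p.endsWith(".parquet"));            // false
        System.out.println(p.endsWith("example.parquet"));     // true, the last element matches

        // String.endsWith is a plain suffix test, which is what an extension check needs
        System.out.println(p.toString().endsWith(".parquet")); // true
    }
}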

View File

@@ -251,6 +251,7 @@ public class HttpFetcherImpl implements HttpFetcher {
         return new SitemapRetriever();
     }

+    /** Recursively fetch sitemaps */
     @Override
     public List<EdgeUrl> fetchSitemapUrls(String root, CrawlDelayTimer delayTimer) {
         try {
@@ -270,7 +271,7 @@ public class HttpFetcherImpl implements HttpFetcher {
             while (!sitemapQueue.isEmpty() && ret.size() < 20_000 && ++fetchedSitemaps < 10) {
                 var head = sitemapQueue.removeFirst();

-                switch (fetchSitemap(head)) {
+                switch (fetchSingleSitemap(head)) {
                     case SitemapResult.SitemapUrls(List<String> urls) -> {
                         for (var url : urls) {
@@ -306,7 +307,7 @@ public class HttpFetcherImpl implements HttpFetcher {
         }
     }

-    private SitemapResult fetchSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
+    private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
         HttpRequest getRequest = HttpRequest.newBuilder()
                 .GET()
                 .uri(sitemapUrl.asURI())
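For context, the sitemap fetch this rename touches is a bounded breadth-first walk: nested sitemap indexes are pushed onto a queue, and the loop stops after a fixed number of sitemap fetches or collected URLs. A rough sketch of that shape, using stand-in result types and a pluggable fetch function rather than the project's actual SitemapResult API:

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Function;

class SitemapWalkSketch {
    // Stand-ins for the real SitemapResult variants; these names are assumptions
    sealed interface SitemapResult permits PageUrls, NestedSitemaps, FetchFailed {}
    record PageUrls(List<String> urls) implements SitemapResult {}
    record NestedSitemaps(List<String> sitemapUrls) implements SitemapResult {}
    record FetchFailed() implements SitemapResult {}

    /** Breadth-first walk over sitemaps, capped on both fetches and collected URLs. */
    static List<String> fetchSitemapUrls(String rootSitemap, Function<String, SitemapResult> fetchSingleSitemap) {
        ArrayDeque<String> sitemapQueue = new ArrayDeque<>();
        sitemapQueue.add(rootSitemap);

        List<String> ret = new ArrayList<>();
        int fetchedSitemaps = 0;

        // Same caps as the loop in the diff: at most ~10 sitemap fetches and 20,000 URLs
        while (!sitemapQueue.isEmpty() && ret.size() < 20_000 && ++fetchedSitemaps < 10) {
            switch (fetchSingleSitemap.apply(sitemapQueue.removeFirst())) {
                case PageUrls(List<String> urls) -> ret.addAll(urls);
                case NestedSitemaps(List<String> nested) -> sitemapQueue.addAll(nested); // "recursion" via the queue
                case FetchFailed ignored -> { } // skip sitemaps that failed to fetch or parse
            }
        }
        return ret;
    }

    public static void main(String[] args) {
        // Stub fetcher: the root index points at one child sitemap, which lists two pages
        List<String> urls = fetchSitemapUrls("https://example.com/sitemap_index.xml", url ->
                url.endsWith("sitemap_index.xml")
                        ? new NestedSitemaps(List.of("https://example.com/sitemap-pages.xml"))
                        : new PageUrls(List.of("https://example.com/a", "https://example.com/b")));
        System.out.println(urls); // [https://example.com/a, https://example.com/b]
    }
}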

View File

@@ -44,6 +44,14 @@ public class DomainLocks {
         return new Semaphore(2);
     }

+    public boolean canLock(EdgeDomain domain) {
+        Semaphore sem = locks.get(domain.topDomain.toLowerCase());
+        if (null == sem)
+            return true;
+        else
+            return sem.availablePermits() > 0;
+    }
+
     public static class DomainLock implements AutoCloseable {
         private final String domainName;
         private final Semaphore semaphore;
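The new canLock is only a hint: availablePermits() can change between the check and the eventual blocking acquire, so a task that looked runnable may still have to wait. A minimal sketch of the per-domain-semaphore pattern with such a non-blocking check (the permit count of 2 mirrors the context above, but names and structure are illustrative, not the project's full class):

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore;

class DomainPermits {
    // At most two concurrent crawl tasks per top domain, mirroring new Semaphore(2) above
    private final Map<String, Semaphore> locks = new ConcurrentHashMap<>();

    private Semaphore permitsFor(String topDomain) {
        return locks.computeIfAbsent(topDomain.toLowerCase(), k -> new Semaphore(2));
    }

    /** Best-effort check: another thread may still grab the last permit before we do. */
    boolean canLock(String topDomain) {
        Semaphore sem = locks.get(topDomain.toLowerCase());
        return sem == null || sem.availablePermits() > 0;
    }

    /** Blocking acquire used when a task actually runs; close() releases the permit. */
    AutoCloseable lock(String topDomain) throws InterruptedException {
        Semaphore sem = permitsFor(topDomain);
        sem.acquire();
        return sem::release;
    }

    public static void main(String[] args) throws Exception {
        DomainPermits permits = new DomainPermits();
        System.out.println("Runnable now? " + permits.canLock("example.com")); // true, no permits taken yet
        try (AutoCloseable held = permits.lock("example.com")) {
            System.out.println("Runnable now? " + permits.canLock("example.com")); // still true, 1 of 2 permits left
        }
    }
}

In the crawler this hint backs CrawlTask.canRun(), which trySubmitDeferredTask uses to decide whether to submit a task immediately or keep it on the deferred list.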

View File

@@ -42,18 +42,20 @@ public interface SerializableCrawlDataStream extends AutoCloseable {
     {
         String fileName = fullPath.getFileName().toString();

-        if (fileName.endsWith(".parquet")) {
+        if (fileName.endsWith(".slop.zip")) {
             try {
-                return new ParquetSerializableCrawlDataStream(fullPath);
+                return new SlopSerializableCrawlDataStream(fullPath);
             } catch (Exception ex) {
                 logger.error("Error reading domain data from " + fullPath, ex);
                 return SerializableCrawlDataStream.empty();
             }
         }
-        if (fileName.endsWith(".slop.zip")) {
+        else if (fileName.endsWith(".parquet")) {
+            logger.error("Opening deprecated parquet-style crawl data stream", new Exception());
             try {
-                return new SlopSerializableCrawlDataStream(fullPath);
+                return new ParquetSerializableCrawlDataStream(fullPath);
             } catch (Exception ex) {
                 logger.error("Error reading domain data from " + fullPath, ex);
                 return SerializableCrawlDataStream.empty();

View File

@@ -81,6 +81,7 @@ public class SearchFilters {
                 ),
                 List.of(
                     new Filter("Vintage", "fa-clock-rotate-left", SearchProfile.VINTAGE, parameters),
+                    new Filter("Small Web", "fa-minus", SearchProfile.SMALLWEB, parameters),
                     new Filter("Plain Text", "fa-file", SearchProfile.PLAIN_TEXT, parameters),
                     new Filter("Tilde", "fa-house", SearchProfile.TILDE, parameters)
                 ),

View File

@@ -9,6 +9,14 @@
                 nicotine: '#f8f8ee',
                 margeblue: '#3e5f6f',
                 liteblue: '#0066cc',
+            },
+            screens: {
+                'coarsepointer': {
+                    'raw': '(pointer: coarse)'
+                },
+                'finepointer': {
+                    'raw': '(pointer: fine)'
+                },
             }
         },
         screens: {

View File

@@ -23,7 +23,7 @@
             @template.serp.part.searchform(query = results.getParams().query(), profile = results.getProfile(), filters = results.getFilters())
         </div>
         <div class="grow"></div>
-        <button class="fixed bottom-10 right-5 sm:hidden text-sm bg-margeblue text-white p-4 rounded-xl active:text-slate-200" id="filter-button">
+        <button class="fixed bottom-10 right-5 finepointer:hidden md:hidden text-sm bg-margeblue text-white p-4 rounded-xl active:text-slate-200" id="filter-button">
             <i class="fas fa-filter mr-3"></i>
             Filters
         </button>

View File

@@ -3,7 +3,7 @@
 @param SearchFilters filters

-<aside class="md:w-64 py-4 shrink-0 hidden sm:block">
+<aside class="md:w-64 py-4 shrink-0 hidden md:block finepointer:block">
     <div class="space-y-6 sticky top-4">
         <div class="bg-white dark:bg-gray-800 p-4 border dark:border-gray-600 border-gray-300">
             <h2 class="font-medium mb-3 flex items-center font-serif hidden md:block">

View File

@@ -9,6 +9,14 @@ module.exports = {
                 nicotine: '#f8f8ee',
                 margeblue: '#3e5f6f',
                 liteblue: '#0066cc',
+            },
+            screens: {
+                'coarsepointer': {
+                    'raw': '(pointer: coarse)'
+                },
+                'finepointer': {
+                    'raw': '(pointer: fine)'
+                },
             }
         },
         screens: {