1
1
mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git synced 2025-10-05 21:22:39 +02:00

Compare commits

...

12 Commits

Author SHA1 Message Date
Viktor Lofgren
9f18ced73d (crawler) Improve deferred task behavior 2025-03-18 12:54:18 +01:00
Viktor Lofgren
18e91269ab (crawler) Improve deferred task behavior 2025-03-18 12:25:22 +01:00
Viktor Lofgren
e315ca5758 (search) Change icon for small web filter
The previous icon was of an irregular size and shifted the layout in an unaesthetic way.
2025-03-17 12:07:34 +01:00
Viktor Lofgren
3ceea17c1d (search) Adjustments to device detection in CSS
Use pointer:fine media query to better distinguish between mobile devices and PCs with a window in portrait orientation.

With this, we never show mobile filtering functionality on desktop, and never show the touch-inaccessible minimized sidebar on mobile.
2025-03-17 12:04:34 +01:00
Viktor Lofgren
b34527c1a3 (search) Add small web filter for new UI 2025-03-17 11:39:19 +01:00
Viktor Lofgren
185bf28fca (crawler) Correct issue leading to parquet files not being correctly preconverted
Path.endsWith("str") != String.endsWith(".str")
2025-03-10 13:48:12 +01:00
Viktor Lofgren
78cc25584a (crawler) Add error logging when entering bad path for historical crawl data 2025-03-10 13:38:40 +01:00
Viktor Lofgren
62ba30bacf (common) Log info about metrics server 2025-03-10 13:12:39 +01:00
Viktor Lofgren
3bb84eb206 (common) Log info about metrics server 2025-03-10 13:03:48 +01:00
Viktor Lofgren
be7d13ccce (crawler) Correct task execution logic in crawler
The old behavior would flag domains as pending too soon, leading to them being omitted from execution if they were not immediately available to run.
2025-03-09 13:47:51 +01:00
Viktor Lofgren
8c088a7c0b (crawler) Remove custom thread factory
This was causing issues, and not really doing much of benefit.
2025-03-09 11:50:52 +01:00
Viktor Lofgren
ea9a642b9b (crawler) More effective task scheduling in the crawler
This should hopefully allow more threads to be busy
2025-03-09 11:44:59 +01:00
11 changed files with 96 additions and 15 deletions

View File

@@ -121,6 +121,7 @@ public class ServiceConfigurationModule extends AbstractModule {
while (nets.hasMoreElements()) {
NetworkInterface netif = nets.nextElement();
logger.info("Considering network interface {}: Up? {}, Loopback? {}", netif.getDisplayName(), netif.isUp(), netif.isLoopback());
if (!netif.isUp() || netif.isLoopback()) {
continue;
}
@@ -128,6 +129,7 @@ public class ServiceConfigurationModule extends AbstractModule {
Enumeration<InetAddress> inetAddresses = netif.getInetAddresses();
while (inetAddresses.hasMoreElements()) {
InetAddress addr = inetAddresses.nextElement();
logger.info("Considering address {}: SiteLocal? {}, Loopback? {}", addr.getHostAddress(), addr.isSiteLocalAddress(), addr.isLoopbackAddress());
if (addr.isSiteLocalAddress() && !addr.isLoopbackAddress()) {
return addr.getHostAddress();
}

View File

@@ -13,7 +13,7 @@ import java.net.InetSocketAddress;
public class MetricsServer {
private static Logger logger = LoggerFactory.getLogger(MetricsServer.class);
private static final Logger logger = LoggerFactory.getLogger(MetricsServer.class);
@Inject
public MetricsServer(ServiceConfiguration configuration) {
@@ -30,6 +30,8 @@ public class MetricsServer {
context.addServlet(new ServletHolder(new MetricsServlet()), "/metrics");
logger.info("MetricsServer listening on {}:{}", configuration.bindAddress(), configuration.metricsPort());
server.start();
}
catch (Exception|NoSuchMethodError ex) {

View File

@@ -248,9 +248,14 @@ public class CrawlerMain extends ProcessMainClass {
// (this happens when the process is restarted after a crash or a shutdown)
tasksDone.set(workLog.countFinishedJobs());
// Create crawl tasks and submit them to the pool for execution
// List of deferred tasks used to ensure beneficial scheduling of domains with regard to DomainLocks.
// Merely shuffling the domains tends to lead to a lot of threads being blocked waiting for a semaphore;
// this will more aggressively attempt to schedule the jobs to avoid blocking
List<CrawlTask> taskList = new ArrayList<>();
// Create crawl tasks
for (CrawlSpecRecord crawlSpec : crawlSpecRecords) {
if (workLog.isJobFinished(crawlSpec.domain()))
if (workLog.isJobFinished(crawlSpec.domain))
continue;
var task = new CrawlTask(
@@ -261,11 +266,22 @@ public class CrawlerMain extends ProcessMainClass {
domainStateDb,
workLog);
if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) == null) {
pool.submitQuietly(task);
// Try to run immediately, to avoid unnecessarily keeping the entire work set in RAM
if (!trySubmitDeferredTask(task)) {
// Otherwise add to the taskList for deferred execution
taskList.add(task);
}
}
// Schedule viable tasks for execution until list is empty
while (!taskList.isEmpty()) {
taskList.removeIf(this::trySubmitDeferredTask);
// Add a small pause here to avoid busy looping toward the end of the execution cycle when
// we might have no new viable tasks to run for hours on end
TimeUnit.MILLISECONDS.sleep(50);
}
logger.info("Shutting down the pool, waiting for tasks to complete...");
pool.shutDown();
@@ -290,6 +306,28 @@ public class CrawlerMain extends ProcessMainClass {
}
}
/**
 * Submit a task for execution if it can be run.
 *
 * @param task the crawl task to hand to the pool
 * @return true if the task was submitted, or if it can be discarded because a task
 *         for the same domain is already registered in pendingCrawlTasks;
 *         false if the task cannot run right now or the submission failed, in which
 *         case the caller should keep the task and try again later
 */
private boolean trySubmitDeferredTask(CrawlTask task) {
    if (!task.canRun()) {
        return false;
    }

    if (pendingCrawlTasks.putIfAbsent(task.domain, task) != null) {
        return true; // task has already run, duplicate in crawl specs
    }

    try {
        // This blocks the caller when the pool is full
        pool.submitQuietly(task);
        return true;
    }
    catch (RuntimeException ex) {
        logger.error("Failed to submit task " + task.domain, ex);

        // Undo the registration made above. The task never reached the pool, so nothing
        // else will clear this entry; if it were left in place, the caller's retry would
        // hit the duplicate check and silently discard the task instead of running it.
        // The two-argument remove only drops the entry if it still maps to this task.
        pendingCrawlTasks.remove(task.domain, task);

        return false;
    }
}
public void runForSingleDomain(String targetDomainName, FileStorageId fileStorageId) throws Exception {
runForSingleDomain(targetDomainName, fileStorageService.getStorage(fileStorageId).asPath());
}
@@ -346,9 +384,20 @@ public class CrawlerMain extends ProcessMainClass {
this.id = Integer.toHexString(domain.hashCode());
}
/**
 * Best effort check of whether this task could be started right now without
 * getting stuck waiting in DomainLocks.
 *
 * @return whatever domainLocks.canLock reports for this task's domain
 */
public boolean canRun() {
    final EdgeDomain edgeDomain = new EdgeDomain(domain);

    return domainLocks.canLock(edgeDomain);
}
@Override
public void run() throws Exception {
if (workLog.isJobFinished(domain)) { // No-Op
logger.info("Omitting task {}, as it is already run", domain);
return;
}
Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
Path slopFile = CrawlerOutputFile.createSlopPath(outputDir, id, domain);
@@ -403,7 +452,7 @@ public class CrawlerMain extends ProcessMainClass {
logger.error("Error fetching domain " + domain, e);
}
finally {
// We don't need to double-count these; it's also kept int he workLog
// We don't need to double-count these; it's also kept in the workLog
pendingCrawlTasks.remove(domain);
Thread.currentThread().setName("[idle]");
@@ -494,7 +543,7 @@ public class CrawlerMain extends ProcessMainClass {
//
// This must be synchronized as chewing through parquet files in parallel leads to enormous memory overhead
private synchronized Path migrateParquetData(Path inputPath, String domain, Path crawlDataRoot) throws IOException {
if (!inputPath.endsWith(".parquet")) {
if (!inputPath.toString().endsWith(".parquet")) {
return inputPath;
}

View File

@@ -251,6 +251,7 @@ public class HttpFetcherImpl implements HttpFetcher {
return new SitemapRetriever();
}
/** Recursively fetch sitemaps */
@Override
public List<EdgeUrl> fetchSitemapUrls(String root, CrawlDelayTimer delayTimer) {
try {
@@ -270,7 +271,7 @@ public class HttpFetcherImpl implements HttpFetcher {
while (!sitemapQueue.isEmpty() && ret.size() < 20_000 && ++fetchedSitemaps < 10) {
var head = sitemapQueue.removeFirst();
switch (fetchSitemap(head)) {
switch (fetchSingleSitemap(head)) {
case SitemapResult.SitemapUrls(List<String> urls) -> {
for (var url : urls) {
@@ -306,7 +307,7 @@ public class HttpFetcherImpl implements HttpFetcher {
}
private SitemapResult fetchSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
private SitemapResult fetchSingleSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
HttpRequest getRequest = HttpRequest.newBuilder()
.GET()
.uri(sitemapUrl.asURI())

View File

@@ -44,6 +44,14 @@ public class DomainLocks {
return new Semaphore(2);
}
/**
 * Reports whether a lock for the given domain looks obtainable at this moment.
 * Nothing is acquired here; the permit count is only read, so the answer is a
 * snapshot that may be stale by the time the caller acts on it.
 *
 * @param domain the domain whose lower-cased topDomain is used as the lookup key
 * @return true if no semaphore is registered for the key, or if the registered
 *         semaphore currently has at least one available permit
 */
public boolean canLock(EdgeDomain domain) {
    final String key = domain.topDomain.toLowerCase();
    final Semaphore existing = locks.get(key);

    // A missing entry means no semaphore has been registered for this key yet
    return existing == null || existing.availablePermits() > 0;
}
public static class DomainLock implements AutoCloseable {
private final String domainName;
private final Semaphore semaphore;

View File

@@ -42,18 +42,20 @@ public interface SerializableCrawlDataStream extends AutoCloseable {
{
String fileName = fullPath.getFileName().toString();
if (fileName.endsWith(".parquet")) {
if (fileName.endsWith(".slop.zip")) {
try {
return new ParquetSerializableCrawlDataStream(fullPath);
return new SlopSerializableCrawlDataStream(fullPath);
} catch (Exception ex) {
logger.error("Error reading domain data from " + fullPath, ex);
return SerializableCrawlDataStream.empty();
}
}
if (fileName.endsWith(".slop.zip")) {
else if (fileName.endsWith(".parquet")) {
logger.error("Opening deprecated parquet-style crawl data stream", new Exception());
try {
return new SlopSerializableCrawlDataStream(fullPath);
return new ParquetSerializableCrawlDataStream(fullPath);
} catch (Exception ex) {
logger.error("Error reading domain data from " + fullPath, ex);
return SerializableCrawlDataStream.empty();

View File

@@ -81,6 +81,7 @@ public class SearchFilters {
),
List.of(
new Filter("Vintage", "fa-clock-rotate-left", SearchProfile.VINTAGE, parameters),
new Filter("Small Web", "fa-minus", SearchProfile.SMALLWEB, parameters),
new Filter("Plain Text", "fa-file", SearchProfile.PLAIN_TEXT, parameters),
new Filter("Tilde", "fa-house", SearchProfile.TILDE, parameters)
),

View File

@@ -9,6 +9,14 @@
nicotine: '#f8f8ee',
margeblue: '#3e5f6f',
liteblue: '#0066cc',
},
screens: {
'coarsepointer': {
'raw': '(pointer: coarse)'
},
'finepointer': {
'raw': '(pointer: fine)'
},
}
},
screens: {

View File

@@ -23,7 +23,7 @@
@template.serp.part.searchform(query = results.getParams().query(), profile = results.getProfile(), filters = results.getFilters())
</div>
<div class="grow"></div>
<button class="fixed bottom-10 right-5 sm:hidden text-sm bg-margeblue text-white p-4 rounded-xl active:text-slate-200" id="filter-button">
<button class="fixed bottom-10 right-5 finepointer:hidden md:hidden text-sm bg-margeblue text-white p-4 rounded-xl active:text-slate-200" id="filter-button">
<i class="fas fa-filter mr-3"></i>
Filters
</button>

View File

@@ -3,7 +3,7 @@
@param SearchFilters filters
<aside class="md:w-64 py-4 shrink-0 hidden sm:block">
<aside class="md:w-64 py-4 shrink-0 hidden md:block finepointer:block">
<div class="space-y-6 sticky top-4">
<div class="bg-white dark:bg-gray-800 p-4 border dark:border-gray-600 border-gray-300">
<h2 class="font-medium mb-3 flex items-center font-serif hidden md:block">

View File

@@ -9,6 +9,14 @@ module.exports = {
nicotine: '#f8f8ee',
margeblue: '#3e5f6f',
liteblue: '#0066cc',
},
screens: {
'coarsepointer': {
'raw': '(pointer: coarse)'
},
'finepointer': {
'raw': '(pointer: fine)'
},
}
},
screens: {